In [1]:
import quandl
import pandas as pd
import numpy as np
import datetime as dt
import pandas_profiling
from time import time
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
In [2]:
# import libraries here; add more as necessary
import numpy as np
import pandas as pd
from time import time
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn import preprocessing

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.model_selection import train_test_split
from collections import defaultdict
# Import supplementary visualization code visuals.py
#import visuals as vs
from numpy import concatenate

# magic word for producing visualizations in notebook.allow plots to appear directly in the notebook
%matplotlib inline
In [3]:
quandl.ApiConfig.api_key = "v5Bazu_S389s29HiutZh"
In [4]:
#Get data from Quandl APIs into dataframes
SP500_DIV_YIELD_MONTH = quandl.get('MULTPL/SP500_DIV_YIELD_MONTH') #MULTPLkeys[0]
SP500_PE_RATIO_MONTH = quandl.get('MULTPL/SP500_PE_RATIO_MONTH')
SHILLER_PE_RATIO_MONTH = quandl.get('MULTPL/SHILLER_PE_RATIO_MONTH')
SP500_EARNINGS_YIELD_MONTH = quandl.get('MULTPL/SP500_EARNINGS_YIELD_MONTH')
SP500_INFLADJ_MONTH = quandl.get('MULTPL/SP500_INFLADJ_MONTH')#MULTPLkeys[4]

SP500_PSR_QUARTER = quandl.get('MULTPL/SP500_PSR_QUARTER')
SP500_DIV_MONTH = quandl.get('MULTPL/SP500_DIV_MONTH')
SP500_DIV_YEAR = quandl.get('MULTPL/SP500_DIV_YEAR')
SP500_DIV_GROWTH_YEAR = quandl.get('MULTPL/SP500_DIV_GROWTH_YEAR')
SP500_DIV_GROWTH_QUARTER = quandl.get('MULTPL/SP500_DIV_GROWTH_QUARTER')
SP500_PBV_RATIO_QUARTER = quandl.get('MULTPL/SP500_PBV_RATIO_QUARTER') #MULTPLkeys[10]

SHILLER_PE_RATIO_YEAR = quandl.get('MULTPL/SHILLER_PE_RATIO_YEAR')
SP500_PE_RATIO_YEAR = quandl.get('MULTPL/SP500_PE_RATIO_YEAR')
SP500_DIV_YIELD_YEAR = quandl.get('MULTPL/SP500_DIV_YIELD_YEAR')
SP500_PSR_YEAR = quandl.get('MULTPL/SP500_PSR_YEAR')
SP500_EARNINGS_YIELD_YEAR = quandl.get('MULTPL/SP500_EARNINGS_YIELD_YEAR')
SP500_PBV_RATIO_YEAR =  quandl.get('MULTPL/SP500_PBV_RATIO_YEAR')
SP500_INFLADJ_YEAR =  quandl.get('MULTPL/SP500_INFLADJ_YEAR')
SP500_REAL_PRICE_MONTH = quandl.get('MULTPL/SP500_REAL_PRICE_MONTH') #MULTPLkeys[18]
SP500_SALES_YEAR =  quandl.get('MULTPL/SP500_SALES_YEAR')

SP500_SALES_GROWTH_YEAR = quandl.get('MULTPL/SP500_SALES_GROWTH_YEAR') #MULTPLkeys[20]
SP500_SALES_QUARTER =  quandl.get('MULTPL/SP500_SALES_QUARTER')
SP500_REAL_SALES_GROWTH_QUARTER = quandl.get('MULTPL/SP500_REAL_SALES_GROWTH_QUARTER')
SP500_SALES_GROWTH_QUARTER = quandl.get('MULTPL/SP500_SALES_GROWTH_QUARTER')
SP500_REAL_SALES_GROWTH_YEAR = quandl.get('MULTPL/SP500_REAL_SALES_GROWTH_YEAR')
SP500_REAL_EARNINGS_GROWTH_YEAR = quandl.get('MULTPL/SP500_REAL_EARNINGS_GROWTH_YEAR')
SP500_REAL_SALES_YEAR = quandl.get('MULTPL/SP500_REAL_SALES_YEAR')
SP500_REAL_EARNINGS_GROWTH_QUARTER = quandl.get('MULTPL/SP500_REAL_EARNINGS_GROWTH_QUARTER')
SP500_EARNINGS_GROWTH_QUARTER = quandl.get('MULTPL/SP500_EARNINGS_GROWTH_QUARTER')
SP500_REAL_SALES_QUARTER = quandl.get('MULTPL/SP500_REAL_SALES_QUARTER')

SP500_EARNINGS_MONTH = quandl.get('MULTPL/SP500_EARNINGS_MONTH') #MULTPLkeys[30]
SP500_BVPS_YEAR = quandl.get('MULTPL/SP500_BVPS_YEAR')
SP500_EARNINGS_YEAR = quandl.get('MULTPL/SP500_EARNINGS_YEAR')
SP500_EARNINGS_GROWTH_YEAR = quandl.get('MULTPL/SP500_EARNINGS_GROWTH_YEAR')
SP500_BVPS_QUARTER = quandl.get('MULTPL/SP500_BVPS_QUARTER')
SP500_REAL_PRICE_YEAR = quandl.get('MULTPL/SP500_REAL_PRICE_YEAR') #MULTPLkeys[35]
In [5]:
#API keywords list
MULTPLkeys = [
'MULTPL/SP500_DIV_YIELD_MONTH',
'MULTPL/SP500_PE_RATIO_MONTH',
'MULTPL/SHILLER_PE_RATIO_MONTH',
'MULTPL/SP500_EARNINGS_YIELD_MONTH',
'MULTPL/SP500_INFLADJ_MONTH',
'MULTPL/SP500_PSR_QUARTER',
'MULTPL/SP500_DIV_MONTH',
'MULTPL/SP500_DIV_YEAR',
'MULTPL/SP500_DIV_GROWTH_YEAR',
'MULTPL/SP500_DIV_GROWTH_QUARTER',
'MULTPL/SP500_PBV_RATIO_QUARTER',
'MULTPL/SHILLER_PE_RATIO_YEAR',
'MULTPL/SP500_PE_RATIO_YEAR',
'MULTPL/SP500_DIV_YIELD_YEAR',
'MULTPL/SP500_PSR_YEAR',
'MULTPL/SP500_EARNINGS_YIELD_YEAR',
'MULTPL/SP500_PBV_RATIO_YEAR',
'MULTPL/SP500_INFLADJ_YEAR',
'MULTPL/SP500_REAL_PRICE_MONTH',
'MULTPL/SP500_SALES_YEAR',
'MULTPL/SP500_SALES_GROWTH_YEAR',
'MULTPL/SP500_SALES_QUARTER',
'MULTPL/SP500_REAL_SALES_GROWTH_QUARTER',
'MULTPL/SP500_SALES_GROWTH_QUARTER',
'MULTPL/SP500_REAL_SALES_GROWTH_YEAR',
'MULTPL/SP500_REAL_EARNINGS_GROWTH_YEAR',
'MULTPL/SP500_REAL_SALES_YEAR',
'MULTPL/SP500_REAL_EARNINGS_GROWTH_QUARTER',
'MULTPL/SP500_EARNINGS_GROWTH_QUARTER',
'MULTPL/SP500_REAL_SALES_QUARTER',
'MULTPL/SP500_EARNINGS_MONTH',
'MULTPL/SP500_BVPS_YEAR',
'MULTPL/SP500_EARNINGS_YEAR',
'MULTPL/SP500_EARNINGS_GROWTH_YEAR',
'MULTPL/SP500_BVPS_QUARTER',
'MULTPL/SP500_REAL_PRICE_YEAR'
]

#Split API keywords to create df variable names
var_list = [i.split('/')[1] for i in MULTPLkeys]
print(var_list[0],var_list[1],var_list[2])
SP500_DIV_YIELD_MONTH SP500_PE_RATIO_MONTH SHILLER_PE_RATIO_MONTH
In [6]:
# def get_data(i):
#     return quandl.get('MULTPL/{}'.format(i))
In [7]:
var_list
Out[7]:
['SP500_DIV_YIELD_MONTH',
 'SP500_PE_RATIO_MONTH',
 'SHILLER_PE_RATIO_MONTH',
 'SP500_EARNINGS_YIELD_MONTH',
 'SP500_INFLADJ_MONTH',
 'SP500_PSR_QUARTER',
 'SP500_DIV_MONTH',
 'SP500_DIV_YEAR',
 'SP500_DIV_GROWTH_YEAR',
 'SP500_DIV_GROWTH_QUARTER',
 'SP500_PBV_RATIO_QUARTER',
 'SHILLER_PE_RATIO_YEAR',
 'SP500_PE_RATIO_YEAR',
 'SP500_DIV_YIELD_YEAR',
 'SP500_PSR_YEAR',
 'SP500_EARNINGS_YIELD_YEAR',
 'SP500_PBV_RATIO_YEAR',
 'SP500_INFLADJ_YEAR',
 'SP500_REAL_PRICE_MONTH',
 'SP500_SALES_YEAR',
 'SP500_SALES_GROWTH_YEAR',
 'SP500_SALES_QUARTER',
 'SP500_REAL_SALES_GROWTH_QUARTER',
 'SP500_SALES_GROWTH_QUARTER',
 'SP500_REAL_SALES_GROWTH_YEAR',
 'SP500_REAL_EARNINGS_GROWTH_YEAR',
 'SP500_REAL_SALES_YEAR',
 'SP500_REAL_EARNINGS_GROWTH_QUARTER',
 'SP500_EARNINGS_GROWTH_QUARTER',
 'SP500_REAL_SALES_QUARTER',
 'SP500_EARNINGS_MONTH',
 'SP500_BVPS_YEAR',
 'SP500_EARNINGS_YEAR',
 'SP500_EARNINGS_GROWTH_YEAR',
 'SP500_BVPS_QUARTER',
 'SP500_REAL_PRICE_YEAR']
In [8]:
# var_list1 = [
#     'SP500_DIV_YIELD_MONTH1',
#     'SP500_PE_RATIO_MONTH2',
#     'SHILLER_PE_RATIO_MONTH']
In [9]:
# for i in var_list:
#     #i = pd.DataFrame(get_data(i))
#     exec(f'{i} = get_data(i)')
#     #print(get_data(i))
#     print(i)
#     break
In [10]:
# x= 'SP500_DIV_YIELD_MONTH'
# exec("%s = %d" % (x,0))
# print(SP500_DIV_YIELD_MONTH)
# print(x)

#exec(f'{i} = get_data(i)')
In [11]:
# for i in range(len(var_list)//10):
#     x= var_list[i]    
#     exec("%s = %d" % (x,0))
#     #print(var_list[i])
#     #print(x)
#     print(SP500_DIV_YIELD_MONTH)

# # SP500_DIV_YIELD_MONTH = quandl.get(str(MULTPLkeys[0]))

# # print(SP500_DIV_YIELD_MONTH.head())
# #SP500_DIV_YIELD_MONTH
# #SP500_PE_RATIO_MONTH
# #SHILLER_PE_RATIO_MONTH

# #dynamic variables
# x= var_list[1]    
# exec("%s = %s" % (x,x))
# SP500_PE_RATIO_MONTH
In [12]:
print(SP500_REAL_PRICE_MONTH.head())
print(SP500_EARNINGS_YIELD_MONTH.head())
            Value
Date             
1871-01-01   4.44
1871-02-01   4.50
1871-03-01   4.61
1871-04-01   4.74
1871-05-01   4.86
            Value
Date             
1871-01-01   9.01
1871-02-01   8.89
1871-03-01   8.68
1871-04-01   8.44
1871-05-01   8.23

df = pd.merge(pd.merge(SP500_REAL_PRICE_MONTH,SP500_EARNINGS_YIELD_MONTH,on='Date'),SHILLER_PE_RATIO_MONTH,on='Date') df.head()

Assumptions-

  1. Assuming SP500_REAL_PRICE_MONTH is remains same on 1st of every month and last day of that month to remove NaN values.
  2. SP500_DIV_YIELD_MONTH and yield values are available for last day of every month. Assuming it remains the same on first of next month by carrying over the same value to first day of next month.
outer join between price and monthly yield df to include all dates of every month.

SP500_REAL_PRICE_MONTH is the target label hence start with this df to include all rows 
In [13]:
SP500_DIV_YIELD_MONTH.head()
Out[13]:
Value
Date
1871-01-31 5.86
1871-02-28 5.78
1871-03-31 5.64
1871-04-30 5.49
1871-05-31 5.35
In [14]:
#Join all month dataframes together on Date column.

df = SP500_REAL_PRICE_MONTH.join(SP500_DIV_YIELD_MONTH,on=None,how='outer',
                                  lsuffix='_SP500_REAL_PRICE_MONTH',rsuffix='_SP500_DIV_YIELD_MONTH',sort=False)

df= df.join(SP500_PE_RATIO_MONTH,on='Date',how='left',sort=False)

df = df.merge(
    SHILLER_PE_RATIO_MONTH,on='Date',how='left').merge(
    SP500_EARNINGS_YIELD_MONTH,on='Date',how='left').merge(
    SP500_INFLADJ_MONTH,on='Date',how='left').merge(
    SP500_PSR_QUARTER,on='Date',how='left').merge(
    SP500_DIV_MONTH,on='Date',how='outer').merge(
    SP500_DIV_YEAR,on='Date',how='left')


df.columns = ['Value_SP500_REAL_PRICE_MONTH','Value_SP500_DIV_YIELD_MONTH','Value_SP500_PE_RATIO_MONTH',
                'Value_SHILLER_PE_RATIO_MONTH','Value_SP500_EARNINGS_YIELD_MONTH','Value_SP500_INFLADJ_MONTH',
              'Value_SP500_PSR_QUARTER','Value_SP500_DIV_MONTH','Value_SP500_DIV_YEAR'
             ]

#print(df.head(6))
print(df.columns)
print(df.shape)
Index(['Value_SP500_REAL_PRICE_MONTH', 'Value_SP500_DIV_YIELD_MONTH',
       'Value_SP500_PE_RATIO_MONTH', 'Value_SHILLER_PE_RATIO_MONTH',
       'Value_SP500_EARNINGS_YIELD_MONTH', 'Value_SP500_INFLADJ_MONTH',
       'Value_SP500_PSR_QUARTER', 'Value_SP500_DIV_MONTH',
       'Value_SP500_DIV_YEAR'],
      dtype='object')
(3548, 9)
In [15]:
def Merge__Rename_function(df,df_var,column_name):
    df = df.merge(df_var,on='Date',how='outer')
    Column_name = 'Value_' + str(column_name)
    #print(Column_name)
    df.rename({'Value':Column_name},axis=1,inplace=True)
    #print(df.head(6))
    print(df.shape)
    #print(df.columns)
    
    return df
In [16]:
var_list
Out[16]:
['SP500_DIV_YIELD_MONTH',
 'SP500_PE_RATIO_MONTH',
 'SHILLER_PE_RATIO_MONTH',
 'SP500_EARNINGS_YIELD_MONTH',
 'SP500_INFLADJ_MONTH',
 'SP500_PSR_QUARTER',
 'SP500_DIV_MONTH',
 'SP500_DIV_YEAR',
 'SP500_DIV_GROWTH_YEAR',
 'SP500_DIV_GROWTH_QUARTER',
 'SP500_PBV_RATIO_QUARTER',
 'SHILLER_PE_RATIO_YEAR',
 'SP500_PE_RATIO_YEAR',
 'SP500_DIV_YIELD_YEAR',
 'SP500_PSR_YEAR',
 'SP500_EARNINGS_YIELD_YEAR',
 'SP500_PBV_RATIO_YEAR',
 'SP500_INFLADJ_YEAR',
 'SP500_REAL_PRICE_MONTH',
 'SP500_SALES_YEAR',
 'SP500_SALES_GROWTH_YEAR',
 'SP500_SALES_QUARTER',
 'SP500_REAL_SALES_GROWTH_QUARTER',
 'SP500_SALES_GROWTH_QUARTER',
 'SP500_REAL_SALES_GROWTH_YEAR',
 'SP500_REAL_EARNINGS_GROWTH_YEAR',
 'SP500_REAL_SALES_YEAR',
 'SP500_REAL_EARNINGS_GROWTH_QUARTER',
 'SP500_EARNINGS_GROWTH_QUARTER',
 'SP500_REAL_SALES_QUARTER',
 'SP500_EARNINGS_MONTH',
 'SP500_BVPS_YEAR',
 'SP500_EARNINGS_YEAR',
 'SP500_EARNINGS_GROWTH_YEAR',
 'SP500_BVPS_QUARTER',
 'SP500_REAL_PRICE_YEAR']
In [17]:
df = Merge__Rename_function(df,SP500_DIV_GROWTH_YEAR,'SP500_DIV_GROWTH_YEAR')
df = Merge__Rename_function(df,SP500_DIV_GROWTH_QUARTER,'SP500_DIV_GROWTH_QUARTER')
df = Merge__Rename_function(df,SP500_PBV_RATIO_QUARTER,'SP500_PBV_RATIO_QUARTER')

df = Merge__Rename_function(df,SHILLER_PE_RATIO_YEAR,'SHILLER_PE_RATIO_YEAR')
df = Merge__Rename_function(df,SP500_PE_RATIO_YEAR,'SP500_PE_RATIO_YEAR')
df = Merge__Rename_function(df,SP500_DIV_YIELD_YEAR,'SP500_DIV_YIELD_YEAR')
df = Merge__Rename_function(df,SP500_PSR_YEAR,'SP500_PSR_YEAR')
df = Merge__Rename_function(df,SP500_EARNINGS_YIELD_YEAR,'SP500_EARNINGS_YIELD_YEAR')
df = Merge__Rename_function(df,SP500_PBV_RATIO_YEAR,'SP500_PBV_RATIO_YEAR')
df = Merge__Rename_function(df,SP500_INFLADJ_YEAR,'SP500_INFLADJ_YEAR')

df = Merge__Rename_function(df,SP500_SALES_YEAR,'SP500_SALES_YEAR')
df = Merge__Rename_function(df,SP500_SALES_GROWTH_YEAR,'SP500_SALES_GROWTH_YEAR')
df = Merge__Rename_function(df,SP500_SALES_QUARTER,'SP500_SALES_QUARTER')
df = Merge__Rename_function(df,SP500_REAL_SALES_GROWTH_QUARTER,'SP500_REAL_SALES_GROWTH_QUARTER')
df = Merge__Rename_function(df,SP500_SALES_GROWTH_QUARTER,'SP500_SALES_GROWTH_QUARTER')
df = Merge__Rename_function(df,SP500_REAL_SALES_GROWTH_YEAR,'SP500_REAL_SALES_GROWTH_YEAR')
df = Merge__Rename_function(df,SP500_REAL_EARNINGS_GROWTH_YEAR,'SP500_REAL_EARNINGS_GROWTH_YEAR')
df = Merge__Rename_function(df,SP500_REAL_SALES_YEAR,'SP500_REAL_SALES_YEAR')

df = Merge__Rename_function(df,SP500_REAL_EARNINGS_GROWTH_QUARTER,var_list[27])
df = Merge__Rename_function(df,SP500_EARNINGS_GROWTH_QUARTER,var_list[28])
df = Merge__Rename_function(df,SP500_REAL_SALES_QUARTER,var_list[29])
df = Merge__Rename_function(df,SP500_EARNINGS_MONTH,var_list[30])
df = Merge__Rename_function(df,SP500_BVPS_YEAR,var_list[31])
df = Merge__Rename_function(df,SP500_EARNINGS_YEAR,var_list[32])
df = Merge__Rename_function(df,SP500_EARNINGS_GROWTH_YEAR,var_list[33])
df = Merge__Rename_function(df,SP500_BVPS_QUARTER,var_list[34])
#df = Merge__Rename_function(df,SP500_REAL_PRICE_YEAR,var_list[35])

df.columns
(3553, 10)
(3559, 11)
(3559, 12)
(3559, 13)
(3559, 14)
(3559, 15)
(3559, 16)
(3559, 17)
(3559, 18)
(3559, 19)
(3559, 20)
(3559, 21)
(3559, 22)
(3559, 23)
(3559, 24)
(3559, 25)
(3559, 26)
(3559, 27)
(3559, 28)
(3559, 29)
(3559, 30)
(3559, 31)
(3559, 32)
(3559, 33)
(3559, 34)
(3559, 35)
Out[17]:
Index(['Value_SP500_REAL_PRICE_MONTH', 'Value_SP500_DIV_YIELD_MONTH',
       'Value_SP500_PE_RATIO_MONTH', 'Value_SHILLER_PE_RATIO_MONTH',
       'Value_SP500_EARNINGS_YIELD_MONTH', 'Value_SP500_INFLADJ_MONTH',
       'Value_SP500_PSR_QUARTER', 'Value_SP500_DIV_MONTH',
       'Value_SP500_DIV_YEAR', 'Value_SP500_DIV_GROWTH_YEAR',
       'Value_SP500_DIV_GROWTH_QUARTER', 'Value_SP500_PBV_RATIO_QUARTER',
       'Value_SHILLER_PE_RATIO_YEAR', 'Value_SP500_PE_RATIO_YEAR',
       'Value_SP500_DIV_YIELD_YEAR', 'Value_SP500_PSR_YEAR',
       'Value_SP500_EARNINGS_YIELD_YEAR', 'Value_SP500_PBV_RATIO_YEAR',
       'Value_SP500_INFLADJ_YEAR', 'Value_SP500_SALES_YEAR',
       'Value_SP500_SALES_GROWTH_YEAR', 'Value_SP500_SALES_QUARTER',
       'Value_SP500_REAL_SALES_GROWTH_QUARTER',
       'Value_SP500_SALES_GROWTH_QUARTER',
       'Value_SP500_REAL_SALES_GROWTH_YEAR',
       'Value_SP500_REAL_EARNINGS_GROWTH_YEAR', 'Value_SP500_REAL_SALES_YEAR',
       'Value_SP500_REAL_EARNINGS_GROWTH_QUARTER',
       'Value_SP500_EARNINGS_GROWTH_QUARTER', 'Value_SP500_REAL_SALES_QUARTER',
       'Value_SP500_EARNINGS_MONTH', 'Value_SP500_BVPS_YEAR',
       'Value_SP500_EARNINGS_YEAR', 'Value_SP500_EARNINGS_GROWTH_YEAR',
       'Value_SP500_BVPS_QUARTER'],
      dtype='object')
In [18]:
df.head()
Out[18]:
Value_SP500_REAL_PRICE_MONTH Value_SP500_DIV_YIELD_MONTH Value_SP500_PE_RATIO_MONTH Value_SHILLER_PE_RATIO_MONTH Value_SP500_EARNINGS_YIELD_MONTH Value_SP500_INFLADJ_MONTH Value_SP500_PSR_QUARTER Value_SP500_DIV_MONTH Value_SP500_DIV_YEAR Value_SP500_DIV_GROWTH_YEAR ... Value_SP500_REAL_EARNINGS_GROWTH_YEAR Value_SP500_REAL_SALES_YEAR Value_SP500_REAL_EARNINGS_GROWTH_QUARTER Value_SP500_EARNINGS_GROWTH_QUARTER Value_SP500_REAL_SALES_QUARTER Value_SP500_EARNINGS_MONTH Value_SP500_BVPS_YEAR Value_SP500_EARNINGS_YEAR Value_SP500_EARNINGS_GROWTH_YEAR Value_SP500_BVPS_QUARTER
Date
1871-01-01 4.44 NaN 11.10 NaN 9.01 89.81 NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1871-01-31 NaN 5.86 NaN NaN NaN NaN NaN 5.26 5.15 NaN ... NaN NaN NaN NaN NaN 8.09 NaN 7.92 NaN NaN
1871-02-01 4.50 NaN 11.25 10.92 8.89 88.33 NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1871-02-28 NaN 5.78 NaN NaN NaN NaN NaN 5.10 NaN NaN ... NaN NaN NaN NaN NaN 7.85 NaN NaN NaN NaN
1871-03-01 4.61 NaN 11.52 11.19 8.68 89.17 NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 35 columns

In [19]:
#Pandas profiling of df
pandas_profiling.ProfileReport(df)
Out[19]:

Overview

Dataset info

Number of variables 36
Number of observations 3559
Total Missing (%) 15.2%
Total size in memory 1001.0 KiB
Average record size in memory 288.0 B

Variables types

Numeric 8
Categorical 0
Boolean 0
Date 1
Text (Unique) 0
Rejected 27
Unsupported 0

Warnings

Variables

Date
Date

Distinct count 3559
Unique (%) 100.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Minimum 1871-01-01 00:00:00
Maximum 2018-12-31 00:00:00

Value_SHILLER_PE_RATIO_MONTH
Numeric

Distinct count 1189
Unique (%) 33.4%
Missing (%) 50.2%
Missing (n) 1787
Infinite (%) 0.0%
Infinite (n) 0
Mean 16.57
Minimum 4.78
Maximum 44.19
Zeros (%) 0.0%

Quantile statistics

Minimum 4.78
5-th percentile 7.982
Q1 11.627
Median 15.67
Q3 20.15
95-th percentile 27.901
Maximum 44.19
Range 39.41
Interquartile range 8.5225

Descriptive statistics

Standard deviation 6.6713
Coef of variation 0.40262
Kurtosis 1.99
Mean 16.57
MAD 5.1052
Skewness 1.1235
Sum 29362
Variance 44.506
Memory size 27.9 KiB
Value Count Frequency (%)  
11.34 6 0.2%
 
17.82 6 0.2%
 
16.6 5 0.1%
 
12.05 5 0.1%
 
13.8 5 0.1%
 
17.05 4 0.1%
 
18.2 4 0.1%
 
15.27 4 0.1%
 
16.16 4 0.1%
 
15.47 4 0.1%
 
Other values (1178) 1725 48.5%
 
(Missing) 1787 50.2%
 

Minimum 5 values

Value Count Frequency (%)  
4.78 1 0.0%
 
5.02 1 0.0%
 
5.04 1 0.0%
 
5.08 1 0.0%
 
5.12 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
43.22 1 0.0%
 
43.53 1 0.0%
 
43.77 1 0.0%
 
43.83 1 0.0%
 
44.19 1 0.0%
 

Value_SHILLER_PE_RATIO_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_PBV_RATIO_QUARTER and should be ignored for analysis

Correlation 1

Value_SP500_BVPS_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_BVPS_YEAR and should be ignored for analysis

Correlation 1

Value_SP500_BVPS_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_SALES_QUARTER and should be ignored for analysis

Correlation 0.9637

Value_SP500_DIV_GROWTH_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_DIV_GROWTH_YEAR and should be ignored for analysis

Correlation 1

Value_SP500_DIV_GROWTH_YEAR
Numeric

Distinct count 34
Unique (%) 1.0%
Missing (%) 99.0%
Missing (n) 3522
Infinite (%) 0.0%
Infinite (n) 0
Mean 6.5478
Minimum -21.07
Maximum 18.25
Zeros (%) 0.0%

Quantile statistics

Minimum -21.07
5-th percentile -2.668
Q1 3.07
Median 7.07
Q3 11.45
95-th percentile 16.26
Maximum 18.25
Range 39.32
Interquartile range 8.38

Descriptive statistics

Standard deviation 6.9429
Coef of variation 1.0603
Kurtosis 5.896
Mean 6.5478
MAD 4.8668
Skewness -1.6172
Sum 242.27
Variance 48.204
Memory size 27.9 KiB
Value Count Frequency (%)  
7.99 2 0.1%
 
5.33 2 0.1%
 
7.07 2 0.1%
 
16.26 2 0.1%
 
9.33 1 0.0%
 
1.49 1 0.0%
 
8.16 1 0.0%
 
12.01 1 0.0%
 
10.0 1 0.0%
 
11.99 1 0.0%
 
Other values (23) 23 0.6%
 
(Missing) 3522 99.0%
 

Minimum 5 values

Value Count Frequency (%)  
-21.07 1 0.0%
 
-3.26 1 0.0%
 
-2.52 1 0.0%
 
0.97 1 0.0%
 
1.45 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
12.72 1 0.0%
 
13.38 1 0.0%
 
14.27 1 0.0%
 
16.26 2 0.1%
 
18.25 1 0.0%
 

Value_SP500_DIV_MONTH
Numeric

Distinct count 1119
Unique (%) 31.4%
Missing (%) 50.2%
Missing (n) 1786
Infinite (%) 0.0%
Infinite (n) 0
Mean 15.415
Minimum 5.03
Maximum 52.26
Zeros (%) 0.0%

Quantile statistics

Minimum 5.03
5-th percentile 6.7
Q1 8.61
Median 12.88
Q3 19.6
95-th percentile 32.694
Maximum 52.26
Range 47.23
Interquartile range 10.99

Descriptive statistics

Standard deviation 8.7887
Coef of variation 0.57013
Kurtosis 3.0931
Mean 15.415
MAD 6.7561
Skewness 1.5803
Sum 27331
Variance 77.241
Memory size 27.9 KiB
Value Count Frequency (%)  
7.19 9 0.3%
 
7.29 8 0.2%
 
8.09 7 0.2%
 
6.81 7 0.2%
 
7.11 6 0.2%
 
8.4 6 0.2%
 
8.95 6 0.2%
 
7.22 6 0.2%
 
9.11 5 0.1%
 
7.71 5 0.1%
 
Other values (1108) 1708 48.0%
 
(Missing) 1786 50.2%
 

Minimum 5 values

Value Count Frequency (%)  
5.03 1 0.0%
 
5.04 1 0.0%
 
5.1 1 0.0%
 
5.17 2 0.1%
 
5.18 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
50.75 1 0.0%
 
51.0 1 0.0%
 
51.45 1 0.0%
 
51.87 1 0.0%
 
52.26 1 0.0%
 

Value_SP500_DIV_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_DIV_MONTH and should be ignored for analysis

Correlation 0.99996

Value_SP500_DIV_YIELD_MONTH
Numeric

Distinct count 608
Unique (%) 17.1%
Missing (%) 50.1%
Missing (n) 1782
Infinite (%) 0.0%
Infinite (n) 0
Mean 4.3421
Minimum 1.11
Maximum 13.84
Zeros (%) 0.0%

Quantile statistics

Minimum 1.11
5-th percentile 1.74
Q1 3.15
Median 4.28
Q3 5.39
95-th percentile 7.18
Maximum 13.84
Range 12.73
Interquartile range 2.24

Descriptive statistics

Standard deviation 1.7025
Coef of variation 0.39208
Kurtosis 0.83756
Mean 4.3421
MAD 1.3453
Skewness 0.48864
Sum 7715.9
Variance 2.8984
Memory size 27.9 KiB
Value Count Frequency (%)  
4.55 10 0.3%
 
4.22 10 0.3%
 
5.18 10 0.3%
 
5.22 10 0.3%
 
4.43 9 0.3%
 
3.53 9 0.3%
 
1.76 9 0.3%
 
3.87 8 0.2%
 
4.69 8 0.2%
 
2.9 8 0.2%
 
Other values (597) 1686 47.4%
 
(Missing) 1782 50.1%
 

Minimum 5 values

Value Count Frequency (%)  
1.11 2 0.1%
 
1.13 1 0.0%
 
1.14 1 0.0%
 
1.15 1 0.0%
 
1.16 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
10.15 1 0.0%
 
11.36 1 0.0%
 
12.46 1 0.0%
 
12.64 1 0.0%
 
13.84 1 0.0%
 

Value_SP500_DIV_YIELD_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_EARNINGS_YIELD_MONTH and should be ignored for analysis

Correlation 1

Value_SP500_EARNINGS_GROWTH_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_REAL_EARNINGS_GROWTH_QUARTER and should be ignored for analysis

Correlation 0.99905

Value_SP500_EARNINGS_GROWTH_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_EARNINGS_GROWTH_QUARTER and should be ignored for analysis

Correlation 1

Value_SP500_EARNINGS_MONTH
Highly correlated

This variable is highly correlated with Value_SP500_DIV_YEAR and should be ignored for analysis

Correlation 0.94144

Value_SP500_EARNINGS_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_EARNINGS_MONTH and should be ignored for analysis

Correlation 0.99996

Value_SP500_EARNINGS_YIELD_MONTH
Highly correlated

This variable is highly correlated with Value_SP500_DIV_YIELD_MONTH and should be ignored for analysis

Correlation 1

Value_SP500_EARNINGS_YIELD_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_DIV_YIELD_YEAR and should be ignored for analysis

Correlation 1

Value_SP500_INFLADJ_MONTH
Highly correlated

This variable is highly correlated with Value_SP500_REAL_PRICE_MONTH and should be ignored for analysis

Correlation 0.96721

Value_SP500_INFLADJ_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_PBV_RATIO_YEAR and should be ignored for analysis

Correlation 1

Value_SP500_PBV_RATIO_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_INFLADJ_MONTH and should be ignored for analysis

Correlation 1

Value_SP500_PBV_RATIO_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_PE_RATIO_YEAR and should be ignored for analysis

Correlation 1

Value_SP500_PE_RATIO_MONTH
Numeric

Distinct count 1117
Unique (%) 31.4%
Missing (%) 50.2%
Missing (n) 1786
Infinite (%) 0.0%
Infinite (n) 0
Mean 15.725
Minimum 5.31
Maximum 123.73
Zeros (%) 0.0%

Quantile statistics

Minimum 5.31
5-th percentile 7.83
Q1 11.45
Median 14.73
Q3 18.04
95-th percentile 25.798
Maximum 123.73
Range 118.42
Interquartile range 6.59

Descriptive statistics

Standard deviation 8.4055
Coef of variation 0.53453
Kurtosis 67.915
Mean 15.725
MAD 4.6369
Skewness 6.4631
Sum 27880
Variance 70.653
Memory size 27.9 KiB
Value Count Frequency (%)  
15.61 6 0.2%
 
19.0 5 0.1%
 
13.82 5 0.1%
 
9.84 5 0.1%
 
7.97 5 0.1%
 
11.48 5 0.1%
 
12.21 5 0.1%
 
17.48 5 0.1%
 
17.83 5 0.1%
 
12.56 4 0.1%
 
Other values (1106) 1723 48.4%
 
(Missing) 1786 50.2%
 

Minimum 5 values

Value Count Frequency (%)  
5.31 1 0.0%
 
5.41 1 0.0%
 
5.74 1 0.0%
 
5.81 1 0.0%
 
5.82 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
101.87 1 0.0%
 
110.37 1 0.0%
 
119.85 1 0.0%
 
123.32 1 0.0%
 
123.73 1 0.0%
 

Value_SP500_PE_RATIO_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_PBV_RATIO_QUARTER and should be ignored for analysis

Correlation 1

Value_SP500_PSR_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_INFLADJ_MONTH and should be ignored for analysis

Correlation 1

Value_SP500_PSR_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_PE_RATIO_YEAR and should be ignored for analysis

Correlation 1

Value_SP500_REAL_EARNINGS_GROWTH_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_REAL_EARNINGS_GROWTH_YEAR and should be ignored for analysis

Correlation 1

Value_SP500_REAL_EARNINGS_GROWTH_YEAR
Numeric

Distinct count 36
Unique (%) 1.0%
Missing (%) 99.0%
Missing (n) 3522
Infinite (%) 0.0%
Infinite (n) 0
Mean 12.684
Minimum -79.48
Maximum 261.66
Zeros (%) 0.0%

Quantile statistics

Minimum -79.48
5-th percentile -32.784
Q1 0.3
Median 10.69
Q3 14.93
95-th percentile 54.266
Maximum 261.66
Range 341.14
Interquartile range 14.63

Descriptive statistics

Standard deviation 48.913
Coef of variation 3.8564
Kurtosis 19.396
Mean 12.684
MAD 21.87
Skewness 3.637
Sum 469.29
Variance 2392.5
Memory size 27.9 KiB
Value Count Frequency (%)  
8.86 2 0.1%
 
7.81 2 0.1%
 
-6.49 1 0.0%
 
15.9 1 0.0%
 
-11.6 1 0.0%
 
-79.48 1 0.0%
 
10.69 1 0.0%
 
11.38 1 0.0%
 
0.36 1 0.0%
 
-51.84 1 0.0%
 
Other values (25) 25 0.7%
 
(Missing) 3522 99.0%
 

Minimum 5 values

Value Count Frequency (%)  
-79.48 1 0.0%
 
-51.84 1 0.0%
 
-28.02 1 0.0%
 
-21.05 1 0.0%
 
-15.56 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
24.85 1 0.0%
 
36.07 1 0.0%
 
49.72 1 0.0%
 
72.45 1 0.0%
 
261.66 1 0.0%
 

Value_SP500_REAL_PRICE_MONTH
Numeric

Distinct count 1401
Unique (%) 39.4%
Missing (%) 50.2%
Missing (n) 1788
Infinite (%) 0.0%
Infinite (n) 0
Mean 262.57
Minimum 2.73
Maximum 2789.8
Zeros (%) 0.0%

Quantile statistics

Minimum 2.73
5-th percentile 4.4
Q1 7.745
Median 16.5
Q3 123.65
95-th percentile 1420.4
Maximum 2789.8
Range 2787.1
Interquartile range 115.91

Descriptive statistics

Standard deviation 523.67
Coef of variation 1.9944
Kurtosis 5.389
Mean 262.57
MAD 359.68
Skewness 2.4198
Sum 465010
Variance 274230
Memory size 27.9 KiB
Value Count Frequency (%)  
4.37 7 0.2%
 
4.46 7 0.2%
 
5.3 6 0.2%
 
7.68 6 0.2%
 
5.18 6 0.2%
 
5.32 6 0.2%
 
4.34 5 0.1%
 
4.38 5 0.1%
 
8.12 5 0.1%
 
4.65 5 0.1%
 
Other values (1390) 1713 48.1%
 
(Missing) 1788 50.2%
 

Minimum 5 values

Value Count Frequency (%)  
2.73 1 0.0%
 
2.85 1 0.0%
 
2.94 2 0.1%
 
3.05 1 0.0%
 
3.17 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
2702.77 1 0.0%
 
2705.16 1 0.0%
 
2736.61 1 0.0%
 
2754.35 1 0.0%
 
2789.8 1 0.0%
 

Value_SP500_REAL_SALES_GROWTH_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_SALES_GROWTH_YEAR and should be ignored for analysis

Correlation 0.99159

Value_SP500_REAL_SALES_GROWTH_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_SALES_GROWTH_QUARTER and should be ignored for analysis

Correlation 0.99159

Value_SP500_REAL_SALES_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_REAL_SALES_YEAR and should be ignored for analysis

Correlation 0.99714

Value_SP500_REAL_SALES_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_SALES_QUARTER and should be ignored for analysis

Correlation 0.9333

Value_SP500_SALES_GROWTH_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_REAL_SALES_GROWTH_QUARTER and should be ignored for analysis

Correlation 0.99183

Value_SP500_SALES_GROWTH_YEAR
Numeric

Distinct count 22
Unique (%) 0.6%
Missing (%) 99.3%
Missing (n) 3534
Infinite (%) 0.0%
Infinite (n) 0
Mean 4.2676
Minimum -12.86
Maximum 10.93
Zeros (%) 0.0%

Quantile statistics

Minimum -12.86
5-th percentile -7.382
Q1 2.09
Median 5.68
Q3 7.68
95-th percentile 10.576
Maximum 10.93
Range 23.79
Interquartile range 5.59

Descriptive statistics

Standard deviation 5.775
Coef of variation 1.3532
Kurtosis 2.4996
Mean 4.2676
MAD 4.2259
Skewness -1.5198
Sum 106.69
Variance 33.351
Memory size 27.9 KiB
Value Count Frequency (%)  
2.09 2 0.1%
 
9.36 2 0.1%
 
7.68 2 0.1%
 
7.03 2 0.1%
 
1.7 1 0.0%
 
5.68 1 0.0%
 
5.37 1 0.0%
 
10.93 1 0.0%
 
4.16 1 0.0%
 
-1.18 1 0.0%
 
Other values (11) 11 0.3%
 
(Missing) 3534 99.3%
 

Minimum 5 values

Value Count Frequency (%)  
-12.86 1 0.0%
 
-8.45 1 0.0%
 
-3.11 1 0.0%
 
-1.18 1 0.0%
 
1.7 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
8.94 1 0.0%
 
9.03 1 0.0%
 
9.36 2 0.1%
 
10.88 1 0.0%
 
10.93 1 0.0%
 

Value_SP500_SALES_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_SALES_YEAR and should be ignored for analysis

Correlation 1

Value_SP500_SALES_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_DIV_YEAR and should be ignored for analysis

Correlation 0.92834

Correlations

Sample

Value_SP500_REAL_PRICE_MONTH Value_SP500_DIV_YIELD_MONTH Value_SP500_PE_RATIO_MONTH Value_SHILLER_PE_RATIO_MONTH Value_SP500_EARNINGS_YIELD_MONTH Value_SP500_INFLADJ_MONTH Value_SP500_PSR_QUARTER Value_SP500_DIV_MONTH Value_SP500_DIV_YEAR Value_SP500_DIV_GROWTH_YEAR Value_SP500_DIV_GROWTH_QUARTER Value_SP500_PBV_RATIO_QUARTER Value_SHILLER_PE_RATIO_YEAR Value_SP500_PE_RATIO_YEAR Value_SP500_DIV_YIELD_YEAR Value_SP500_PSR_YEAR Value_SP500_EARNINGS_YIELD_YEAR Value_SP500_PBV_RATIO_YEAR Value_SP500_INFLADJ_YEAR Value_SP500_SALES_YEAR Value_SP500_SALES_GROWTH_YEAR Value_SP500_SALES_QUARTER Value_SP500_REAL_SALES_GROWTH_QUARTER Value_SP500_SALES_GROWTH_QUARTER Value_SP500_REAL_SALES_GROWTH_YEAR Value_SP500_REAL_EARNINGS_GROWTH_YEAR Value_SP500_REAL_SALES_YEAR Value_SP500_REAL_EARNINGS_GROWTH_QUARTER Value_SP500_EARNINGS_GROWTH_QUARTER Value_SP500_REAL_SALES_QUARTER Value_SP500_EARNINGS_MONTH Value_SP500_BVPS_YEAR Value_SP500_EARNINGS_YEAR Value_SP500_EARNINGS_GROWTH_YEAR Value_SP500_BVPS_QUARTER
Date
1871-01-01 4.44 NaN 11.10 NaN 9.01 89.81 NaN NaN NaN NaN NaN NaN NaN 11.1 NaN NaN 9.01 NaN 89.81 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1871-01-31 NaN 5.86 NaN NaN NaN NaN NaN 5.26 5.15 NaN NaN NaN NaN NaN 5.86 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 8.09 NaN 7.92 NaN NaN
1871-02-01 4.50 NaN 11.25 10.92 8.89 88.33 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1871-02-28 NaN 5.78 NaN NaN NaN NaN NaN 5.10 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 7.85 NaN NaN NaN NaN
1871-03-01 4.61 NaN 11.52 11.19 8.68 89.17 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
In [20]:
def Column_missing_values(df):
    '''
    Function to find missing values or NaN values per column and print percentage of missing values per column
    in bar chart.
    IN - any pandas dataframe 
    '''
    #Percentage of missing values per column
    missing_columns = df.isnull().sum()
    if missing_columns.values.any() != 0:
        missing_columns = (missing_columns[missing_columns>0]/df.shape[0]) * 100
        missing_columns.sort_values(inplace=True)
        missing_columns.plot.bar(title = 'Column wise percentage missing values', figsize=(8,4))
    else:
        print('No missing values in provided dataframe')
        
    return
In [21]:
Column_missing_values(df)
In [22]:
#Correlation plot for numerical encoded version of API data in df.
sns.heatmap(df.corr(),annot=None,fmt='.2f',square=False)
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x23b2a9a5ba8>
In [23]:
from pandas.plotting import autocorrelation_plot
autocorrelation_plot(df)
plt.show()
In [24]:
#Drop last rows with NaN values for Value_SP500_REAL_PRICE_MONTH because these were outlier with no price values.
df.drop(df.tail(11).index,axis=0,inplace=True)

Impute and Interpolate missing values

Original dataset has many missing or NaN values which is noise to end results. General methods like replacing with mean values can not be applied because it would add bias to the timeseries dataset.

In [25]:
import  impyute.imputation.cs 
import impyute.imputation.ts
from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer, MissingIndicator
from statsmodels.tsa.arima_model import ARIMA
from sklearn.model_selection import cross_val_score
In [26]:
#Create an imputation object
imputer_most_frequent= SimpleImputer(missing_values=np.nan,strategy ='most_frequent')
#Inject imputed values in the dataset.
df_imputed = pd.DataFrame(imputer_most_frequent.fit_transform(df))
df_imputed.columns = df.columns
df_imputed.index = df.index
In [27]:
pandas_profiling.ProfileReport(df_imputed)
Out[27]:

Overview

Dataset info

Number of variables 36
Number of observations 3548
Total Missing (%) 0.0%
Total size in memory 998.0 KiB
Average record size in memory 288.0 B

Variables types

Numeric 25
Categorical 0
Boolean 0
Date 1
Text (Unique) 0
Rejected 10
Unsupported 0

Warnings

Variables

Date
Date

Distinct count 3548
Unique (%) 100.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Minimum 1871-01-01 00:00:00
Maximum 2018-12-31 00:00:00

Value_SHILLER_PE_RATIO_MONTH
Numeric

Distinct count 1188
Unique (%) 33.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 13.952
Minimum 4.78
Maximum 44.19
Zeros (%) 0.0%

Quantile statistics

Minimum 4.78
5-th percentile 9.26
Q1 11.34
Median 11.34
Q3 15.66
95-th percentile 25.41
Maximum 44.19
Range 39.41
Interquartile range 4.32

Descriptive statistics

Standard deviation 5.3909
Coef of variation 0.38639
Kurtosis 6.0424
Mean 13.952
MAD 3.9298
Skewness 2.1774
Sum 49501
Variance 29.061
Memory size 27.8 KiB
Value Count Frequency (%)  
11.34 1782 50.2%
 
17.82 6 0.2%
 
13.8 5 0.1%
 
16.6 5 0.1%
 
12.05 5 0.1%
 
16.83 4 0.1%
 
10.0 4 0.1%
 
10.91 4 0.1%
 
15.47 4 0.1%
 
18.96 4 0.1%
 
Other values (1178) 1725 48.6%
 

Minimum 5 values

Value Count Frequency (%)  
4.78 1 0.0%
 
5.02 1 0.0%
 
5.04 1 0.0%
 
5.08 1 0.0%
 
5.12 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
43.22 1 0.0%
 
43.53 1 0.0%
 
43.77 1 0.0%
 
43.83 1 0.0%
 
44.19 1 0.0%
 

Value_SHILLER_PE_RATIO_YEAR
Numeric

Distinct count 141
Unique (%) 4.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 12.108
Minimum 5.12
Maximum 43.77
Zeros (%) 0.0%

Quantile statistics

Minimum 5.12
5-th percentile 11.9
Q1 11.9
Median 11.9
Q3 11.9
95-th percentile 11.9
Maximum 43.77
Range 38.65
Interquartile range 0

Descriptive statistics

Standard deviation 1.7274
Coef of variation 0.14267
Kurtosis 112.09
Mean 12.108
MAD 0.45349
Skewness 9.3367
Sum 42958
Variance 2.9838
Memory size 27.8 KiB
Value Count Frequency (%)  
11.9 3402 95.9%
 
17.22 2 0.1%
 
9.26 2 0.1%
 
17.09 2 0.1%
 
18.47 2 0.1%
 
20.32 2 0.1%
 
13.9 2 0.1%
 
27.21 1 0.0%
 
22.9 1 0.0%
 
20.97 1 0.0%
 
Other values (131) 131 3.7%
 

Minimum 5 values

Value Count Frequency (%)  
5.12 1 0.0%
 
5.99 1 0.0%
 
6.1 1 0.0%
 
6.29 1 0.0%
 
6.64 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
33.31 1 0.0%
 
33.36 1 0.0%
 
36.98 1 0.0%
 
40.57 1 0.0%
 
43.77 1 0.0%
 

Value_SP500_BVPS_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_REAL_SALES_QUARTER and should be ignored for analysis

Correlation 0.951

Value_SP500_BVPS_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_REAL_SALES_YEAR and should be ignored for analysis

Correlation 0.9145

Value_SP500_DIV_GROWTH_QUARTER
Numeric

Distinct count 113
Unique (%) 3.2%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 11.696
Minimum -21.07
Maximum 18.25
Zeros (%) 0.0%

Quantile statistics

Minimum -21.07
5-th percentile 11.89
Q1 11.89
Median 11.89
Q3 11.89
95-th percentile 11.89
Maximum 18.25
Range 39.32
Interquartile range 0

Descriptive statistics

Standard deviation 1.6985
Coef of variation 0.14523
Kurtosis 144.27
Mean 11.696
MAD 0.4133
Skewness -10.499
Sum 41496
Variance 2.8849
Memory size 27.8 KiB
Value Count Frequency (%)  
11.89 3434 96.8%
 
12.65 2 0.1%
 
12.49 2 0.1%
 
10.46 1 0.0%
 
18.25 1 0.0%
 
5.05 1 0.0%
 
1.55 1 0.0%
 
-4.71 1 0.0%
 
1.26 1 0.0%
 
8.02 1 0.0%
 
Other values (103) 103 2.9%
 

Minimum 5 values

Value Count Frequency (%)  
-21.07 1 0.0%
 
-19.63 1 0.0%
 
-17.17 1 0.0%
 
-13.9 1 0.0%
 
-10.86 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
16.74 1 0.0%
 
17.4 1 0.0%
 
17.47 1 0.0%
 
17.51 1 0.0%
 
18.25 1 0.0%
 

Value_SP500_DIV_GROWTH_YEAR
Numeric

Distinct count 32
Unique (%) 0.9%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -20.824
Minimum -21.07
Maximum 18.25
Zeros (%) 0.0%

Quantile statistics

Minimum -21.07
5-th percentile -21.07
Q1 -21.07
Median -21.07
Q3 -21.07
95-th percentile -21.07
Maximum 18.25
Range 39.32
Interquartile range 0

Descriptive statistics

Standard deviation 2.6645
Coef of variation -0.12795
Kurtosis 125.17
Mean -20.824
MAD 0.48728
Skewness 11.107
Sum -73884
Variance 7.0994
Memory size 27.8 KiB
Value Count Frequency (%)  
-21.07 3517 99.1%
 
6.99 1 0.0%
 
3.07 1 0.0%
 
10.0 1 0.0%
 
1.49 1 0.0%
 
9.33 1 0.0%
 
12.01 1 0.0%
 
11.99 1 0.0%
 
0.97 1 0.0%
 
2.12 1 0.0%
 
Other values (22) 22 0.6%
 

Minimum 5 values

Value Count Frequency (%)  
-21.07 3517 99.1%
 
-3.26 1 0.0%
 
-2.52 1 0.0%
 
0.97 1 0.0%
 
1.45 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
12.72 1 0.0%
 
13.38 1 0.0%
 
14.27 1 0.0%
 
16.26 1 0.0%
 
18.25 1 0.0%
 

Value_SP500_DIV_MONTH
Numeric

Distinct count 1118
Unique (%) 31.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 11.3
Minimum 5.03
Maximum 52.26
Zeros (%) 0.0%

Quantile statistics

Minimum 5.03
5-th percentile 7.11
Q1 7.19
Median 7.19
Q3 12.843
95-th percentile 24.97
Maximum 52.26
Range 47.23
Interquartile range 5.6525

Descriptive statistics

Standard deviation 7.4502
Coef of variation 0.6593
Kurtosis 7.1196
Mean 11.3
MAD 5.3875
Skewness 2.448
Sum 40093
Variance 55.506
Memory size 27.8 KiB
Value Count Frequency (%)  
7.19 1784 50.3%
 
7.29 8 0.2%
 
6.81 7 0.2%
 
8.09 7 0.2%
 
8.4 6 0.2%
 
8.95 6 0.2%
 
7.22 6 0.2%
 
7.11 6 0.2%
 
6.94 5 0.1%
 
7.05 5 0.1%
 
Other values (1108) 1708 48.1%
 

Minimum 5 values

Value Count Frequency (%)  
5.03 1 0.0%
 
5.04 1 0.0%
 
5.1 1 0.0%
 
5.17 2 0.1%
 
5.18 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
50.75 1 0.0%
 
51.0 1 0.0%
 
51.45 1 0.0%
 
51.87 1 0.0%
 
52.26 1 0.0%
 

Value_SP500_DIV_YEAR
Numeric

Distinct count 146
Unique (%) 4.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 7.1088
Minimum 5.04
Maximum 52.26
Zeros (%) 0.0%

Quantile statistics

Minimum 5.04
5-th percentile 6.7
Q1 6.7
Median 6.7
Q3 6.7
95-th percentile 6.7
Maximum 52.26
Range 47.22
Interquartile range 0

Descriptive statistics

Standard deviation 2.8765
Coef of variation 0.40463
Kurtosis 118.18
Mean 7.1088
MAD 0.79031
Skewness 9.9101
Sum 25222
Variance 8.2741
Memory size 27.8 KiB
Value Count Frequency (%)  
6.7 3398 95.8%
 
7.11 2 0.1%
 
6.81 2 0.1%
 
7.61 2 0.1%
 
15.89 2 0.1%
 
12.17 2 0.1%
 
24.21 1 0.0%
 
9.18 1 0.0%
 
8.07 1 0.0%
 
16.97 1 0.0%
 
Other values (136) 136 3.8%
 

Minimum 5 values

Value Count Frequency (%)  
5.04 1 0.0%
 
5.15 1 0.0%
 
5.18 1 0.0%
 
5.2 1 0.0%
 
5.55 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
48.17 1 0.0%
 
50.0 1 0.0%
 
50.03 1 0.0%
 
51.02 1 0.0%
 
52.26 1 0.0%
 

Value_SP500_DIV_YIELD_MONTH
Numeric

Distinct count 607
Unique (%) 17.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 4.2812
Minimum 1.11
Maximum 13.84
Zeros (%) 0.0%

Quantile statistics

Minimum 1.11
5-th percentile 1.96
Q1 4.22
Median 4.22
Q3 4.28
95-th percentile 6.57
Maximum 13.84
Range 12.73
Interquartile range 0.06

Descriptive statistics

Standard deviation 1.2062
Coef of variation 0.28175
Kurtosis 4.7742
Mean 4.2812
MAD 0.70379
Skewness 0.83882
Sum 15190
Variance 1.455
Memory size 27.8 KiB
Value Count Frequency (%)  
4.22 1781 50.2%
 
5.22 10 0.3%
 
4.55 10 0.3%
 
5.18 10 0.3%
 
4.43 9 0.3%
 
3.53 9 0.3%
 
1.76 9 0.3%
 
4.17 8 0.2%
 
2.93 8 0.2%
 
4.1 8 0.2%
 
Other values (597) 1686 47.5%
 

Minimum 5 values

Value Count Frequency (%)  
1.11 2 0.1%
 
1.13 1 0.0%
 
1.14 1 0.0%
 
1.15 1 0.0%
 
1.16 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
10.15 1 0.0%
 
11.36 1 0.0%
 
12.46 1 0.0%
 
12.64 1 0.0%
 
13.84 1 0.0%
 

Value_SP500_DIV_YIELD_YEAR
Numeric

Distinct count 137
Unique (%) 3.9%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 4.0815
Minimum 1.17
Maximum 10.15
Zeros (%) 0.0%

Quantile statistics

Minimum 1.17
5-th percentile 4.07
Q1 4.07
Median 4.07
Q3 4.07
95-th percentile 4.07
Maximum 10.15
Range 8.98
Interquartile range 0

Descriptive statistics

Standard deviation 0.36379
Coef of variation 0.089132
Kurtosis 79.461
Mean 4.0815
MAD 0.069995
Skewness 4.2121
Sum 14481
Variance 0.13234
Memory size 27.8 KiB
Value Count Frequency (%)  
4.07 3400 95.8%
 
6.2 2 0.1%
 
5.41 2 0.1%
 
4.4 2 0.1%
 
3.49 2 0.1%
 
3.53 2 0.1%
 
3.81 2 0.1%
 
1.76 2 0.1%
 
1.61 2 0.1%
 
5.71 2 0.1%
 
Other values (127) 130 3.7%
 

Minimum 5 values

Value Count Frequency (%)  
1.17 1 0.0%
 
1.22 1 0.0%
 
1.36 1 0.0%
 
1.37 1 0.0%
 
1.61 2 0.1%
 

Maximum 5 values

Value Count Frequency (%)  
7.49 1 0.0%
 
8.11 1 0.0%
 
8.38 1 0.0%
 
9.72 1 0.0%
 
10.15 1 0.0%
 

Value_SP500_EARNINGS_GROWTH_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_REAL_EARNINGS_GROWTH_QUARTER and should be ignored for analysis

Correlation 0.99834

Value_SP500_EARNINGS_GROWTH_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_REAL_EARNINGS_GROWTH_YEAR and should be ignored for analysis

Correlation 0.9994

Value_SP500_EARNINGS_MONTH
Highly correlated

This variable is highly correlated with Value_SP500_DIV_MONTH and should be ignored for analysis

Correlation 0.94595

Value_SP500_EARNINGS_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_DIV_YEAR and should be ignored for analysis

Correlation 0.94718

Value_SP500_EARNINGS_YIELD_MONTH
Numeric

Distinct count 792
Unique (%) 22.3%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 6.4929
Minimum 0.81
Maximum 18.82
Zeros (%) 0.0%

Quantile statistics

Minimum 0.81
5-th percentile 4.45
Q1 5.62
Median 5.62
Q3 6.79
95-th percentile 10.95
Maximum 18.82
Range 18.01
Interquartile range 1.17

Descriptive statistics

Standard deviation 2.1019
Coef of variation 0.32372
Kurtosis 5.3453
Mean 6.4929
MAD 1.4629
Skewness 2.0606
Sum 23037
Variance 4.418
Memory size 27.8 KiB
Value Count Frequency (%)  
5.62 1786 50.3%
 
5.72 10 0.3%
 
5.26 10 0.3%
 
5.3 9 0.3%
 
5.29 9 0.3%
 
5.61 8 0.2%
 
7.24 8 0.2%
 
5.53 8 0.2%
 
5.55 8 0.2%
 
5.54 8 0.2%
 
Other values (782) 1684 47.5%
 

Minimum 5 values

Value Count Frequency (%)  
0.81 2 0.1%
 
0.83 1 0.0%
 
0.91 1 0.0%
 
0.98 1 0.0%
 
1.08 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
17.18 1 0.0%
 
17.21 1 0.0%
 
17.42 1 0.0%
 
18.48 1 0.0%
 
18.82 1 0.0%
 

Value_SP500_EARNINGS_YIELD_YEAR
Numeric

Distinct count 137
Unique (%) 3.9%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 5.4126
Minimum 1.41
Maximum 17.42
Zeros (%) 0.0%

Quantile statistics

Minimum 1.41
5-th percentile 5.33
Q1 5.33
Median 5.33
Q3 5.33
95-th percentile 5.33
Maximum 17.42
Range 16.01
Interquartile range 0

Descriptive statistics

Standard deviation 0.67946
Coef of variation 0.12553
Kurtosis 105.43
Mean 5.4126
MAD 0.17808
Skewness 9.0727
Sum 19204
Variance 0.46167
Memory size 27.8 KiB
Value Count Frequency (%)  
5.33 3401 95.9%
 
5.53 3 0.1%
 
8.46 3 0.1%
 
5.55 2 0.1%
 
6.57 2 0.1%
 
7.42 2 0.1%
 
4.44 2 0.1%
 
7.35 2 0.1%
 
11.08 2 0.1%
 
9.87 2 0.1%
 
Other values (127) 127 3.6%
 

Minimum 5 values

Value Count Frequency (%)  
1.41 1 0.0%
 
2.17 1 0.0%
 
3.04 1 0.0%
 
3.18 1 0.0%
 
3.44 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
13.53 1 0.0%
 
13.84 1 0.0%
 
15.1 1 0.0%
 
15.77 1 0.0%
 
17.42 1 0.0%
 

Value_SP500_INFLADJ_MONTH
Highly correlated

This variable is highly correlated with Value_SP500_REAL_PRICE_MONTH and should be ignored for analysis

Correlation 0.95138

Value_SP500_INFLADJ_YEAR
Numeric

Distinct count 150
Unique (%) 4.2%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 101.53
Minimum 81.79
Maximum 2914.2
Zeros (%) 0.0%

Quantile statistics

Minimum 81.79
5-th percentile 81.79
Q1 81.79
Median 81.79
Q3 81.79
95-th percentile 81.79
Maximum 2914.2
Range 2832.4
Interquartile range 0

Descriptive statistics

Standard deviation 157.9
Coef of variation 1.5552
Kurtosis 149.94
Mean 101.53
MAD 37.848
Skewness 11.449
Sum 360230
Variance 24932
Memory size 27.8 KiB
Value Count Frequency (%)  
81.79 3399 95.8%
 
178.12 1 0.0%
 
103.6 1 0.0%
 
739.63 1 0.0%
 
609.48 1 0.0%
 
188.58 1 0.0%
 
178.98 1 0.0%
 
176.02 1 0.0%
 
2041.06 1 0.0%
 
161.59 1 0.0%
 
Other values (140) 140 3.9%
 

Minimum 5 values

Value Count Frequency (%)  
81.79 3399 95.8%
 
88.75 1 0.0%
 
89.81 1 0.0%
 
94.32 1 0.0%
 
94.95 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
2187.26 1 0.0%
 
2361.3 1 0.0%
 
2758.77 1 0.0%
 
2836.75 1 0.0%
 
2914.22 1 0.0%
 

Value_SP500_PBV_RATIO_QUARTER
Numeric

Distinct count 61
Unique (%) 1.7%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.5849
Minimum 1.78
Maximum 5.06
Zeros (%) 0.0%

Quantile statistics

Minimum 1.78
5-th percentile 2.58
Q1 2.58
Median 2.58
Q3 2.58
95-th percentile 2.58
Maximum 5.06
Range 3.28
Interquartile range 0

Descriptive statistics

Standard deviation 0.10134
Coef of variation 0.039203
Kurtosis 309.88
Mean 2.5849
MAD 0.015266
Skewness 14.136
Sum 9171.1
Variance 0.010269
Memory size 27.8 KiB
Value Count Frequency (%)  
2.58 3473 97.9%
 
2.91 3 0.1%
 
2.76 3 0.1%
 
2.77 3 0.1%
 
2.43 2 0.1%
 
3.03 2 0.1%
 
2.83 2 0.1%
 
2.67 2 0.1%
 
2.78 2 0.1%
 
2.74 2 0.1%
 
Other values (51) 54 1.5%
 

Minimum 5 values

Value Count Frequency (%)  
1.78 1 0.0%
 
1.85 1 0.0%
 
1.89 1 0.0%
 
1.9 1 0.0%
 
2.0 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
4.05 1 0.0%
 
4.49 1 0.0%
 
4.65 1 0.0%
 
5.05 1 0.0%
 
5.06 1 0.0%
 

Value_SP500_PBV_RATIO_YEAR
Numeric

Distinct count 19
Unique (%) 0.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.1744
Minimum 2
Maximum 5.05
Zeros (%) 0.0%

Quantile statistics

Minimum 2
5-th percentile 2.17
Q1 2.17
Median 2.17
Q3 2.17
95-th percentile 2.17
Maximum 5.05
Range 3.05
Interquartile range 0

Descriptive statistics

Standard deviation 0.078208
Coef of variation 0.035968
Kurtosis 682.74
Mean 2.1744
MAD 0.0089232
Skewness 23.544
Sum 7714.7
Variance 0.0061165
Memory size 27.8 KiB
Value Count Frequency (%)  
2.17 3529 99.5%
 
2.76 2 0.1%
 
2.14 1 0.0%
 
2.91 1 0.0%
 
3.5 1 0.0%
 
2.73 1 0.0%
 
2.77 1 0.0%
 
2.81 1 0.0%
 
2.58 1 0.0%
 
3.39 1 0.0%
 
Other values (9) 9 0.3%
 

Minimum 5 values

Value Count Frequency (%)  
2.0 1 0.0%
 
2.05 1 0.0%
 
2.14 1 0.0%
 
2.17 3529 99.5%
 
2.58 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
3.3 1 0.0%
 
3.39 1 0.0%
 
3.5 1 0.0%
 
4.05 1 0.0%
 
5.05 1 0.0%
 

Value_SP500_PE_RATIO_MONTH
Numeric

Distinct count 1116
Unique (%) 31.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 15.667
Minimum 5.31
Maximum 123.73
Zeros (%) 0.0%

Quantile statistics

Minimum 5.31
5-th percentile 9.13
Q1 14.73
Median 15.61
Q3 15.61
95-th percentile 22.479
Maximum 123.73
Range 118.42
Interquartile range 0.88

Descriptive statistics

Standard deviation 5.9414
Coef of variation 0.37922
Kurtosis 139.05
Mean 15.667
MAD 2.3416
Skewness 9.1666
Sum 55588
Variance 35.3
Memory size 27.8 KiB
Value Count Frequency (%)  
15.61 1781 50.2%
 
11.48 5 0.1%
 
12.21 5 0.1%
 
17.48 5 0.1%
 
9.84 5 0.1%
 
19.0 5 0.1%
 
17.83 5 0.1%
 
7.97 5 0.1%
 
13.82 5 0.1%
 
9.13 4 0.1%
 
Other values (1106) 1723 48.6%
 

Minimum 5 values

Value Count Frequency (%)  
5.31 1 0.0%
 
5.41 1 0.0%
 
5.74 1 0.0%
 
5.81 1 0.0%
 
5.82 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
101.87 1 0.0%
 
110.37 1 0.0%
 
119.85 1 0.0%
 
123.32 1 0.0%
 
123.73 1 0.0%
 

Value_SP500_PE_RATIO_YEAR
Numeric

Distinct count 142
Unique (%) 4.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 11.989
Minimum 5.74
Maximum 70.91
Zeros (%) 0.0%

Quantile statistics

Minimum 5.74
5-th percentile 11.82
Q1 11.82
Median 11.82
Q3 11.82
95-th percentile 11.82
Maximum 70.91
Range 65.17
Interquartile range 0

Descriptive statistics

Standard deviation 1.6921
Coef of variation 0.14114
Kurtosis 487.59
Mean 11.989
MAD 0.37918
Skewness 17.568
Sum 42537
Variance 2.8634
Memory size 27.8 KiB
Value Count Frequency (%)  
11.82 3401 95.9%
 
13.48 2 0.1%
 
18.01 2 0.1%
 
7.97 2 0.1%
 
9.02 2 0.1%
 
10.13 2 0.1%
 
18.77 2 0.1%
 
17.81 1 0.0%
 
19.33 1 0.0%
 
19.99 1 0.0%
 
Other values (132) 132 3.7%
 

Minimum 5 values

Value Count Frequency (%)  
5.74 1 0.0%
 
6.34 1 0.0%
 
6.62 1 0.0%
 
7.22 1 0.0%
 
7.39 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
29.04 1 0.0%
 
31.43 1 0.0%
 
32.92 1 0.0%
 
46.17 1 0.0%
 
70.91 1 0.0%
 

Value_SP500_PSR_QUARTER
Numeric

Distinct count 55
Unique (%) 1.6%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.432
Minimum 0.8
Maximum 2.31
Zeros (%) 0.0%

Quantile statistics

Minimum 0.8
5-th percentile 1.43
Q1 1.43
Median 1.43
Q3 1.43
95-th percentile 1.43
Maximum 2.31
Range 1.51
Interquartile range 0

Descriptive statistics

Standard deviation 0.048445
Coef of variation 0.03383
Kurtosis 154.51
Mean 1.432
MAD 0.0071731
Skewness 8.0394
Sum 5080.8
Variance 0.002347
Memory size 27.8 KiB
Value Count Frequency (%)  
1.43 3479 98.1%
 
1.44 3 0.1%
 
1.54 3 0.1%
 
1.31 3 0.1%
 
1.19 2 0.1%
 
1.77 2 0.1%
 
1.52 2 0.1%
 
1.27 2 0.1%
 
2.1 2 0.1%
 
1.66 2 0.1%
 
Other values (45) 48 1.4%
 

Minimum 5 values

Value Count Frequency (%)  
0.8 1 0.0%
 
0.87 1 0.0%
 
0.97 1 0.0%
 
1.08 1 0.0%
 
1.1 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
2.1 2 0.1%
 
2.13 1 0.0%
 
2.17 1 0.0%
 
2.25 1 0.0%
 
2.31 1 0.0%
 

Value_SP500_PSR_YEAR
Numeric

Distinct count 16
Unique (%) 0.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.3116
Minimum 0.87
Maximum 2.31
Zeros (%) 0.0%

Quantile statistics

Minimum 0.87
5-th percentile 1.31
Q1 1.31
Median 1.31
Q3 1.31
95-th percentile 1.31
Maximum 2.31
Range 1.44
Interquartile range 0

Descriptive statistics

Standard deviation 0.033841
Coef of variation 0.025802
Kurtosis 506.53
Mean 1.3116
MAD 0.003505
Skewness 20.076
Sum 4653.5
Variance 0.0011452
Memory size 27.8 KiB
Value Count Frequency (%)  
1.31 3530 99.5%
 
1.56 2 0.1%
 
1.43 2 0.1%
 
1.77 2 0.1%
 
1.66 1 0.0%
 
0.87 1 0.0%
 
1.81 1 0.0%
 
2.13 1 0.0%
 
1.3 1 0.0%
 
1.95 1 0.0%
 
Other values (6) 6 0.2%
 

Minimum 5 values

Value Count Frequency (%)  
0.87 1 0.0%
 
1.19 1 0.0%
 
1.23 1 0.0%
 
1.3 1 0.0%
 
1.31 3530 99.5%
 

Maximum 5 values

Value Count Frequency (%)  
1.81 1 0.0%
 
1.95 1 0.0%
 
2.13 1 0.0%
 
2.17 1 0.0%
 
2.31 1 0.0%
 

Value_SP500_REAL_EARNINGS_GROWTH_QUARTER
Numeric

Distinct count 114
Unique (%) 3.2%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 11.791
Minimum -90.27
Maximum 905.56
Zeros (%) 0.0%

Quantile statistics

Minimum -90.27
5-th percentile 11.38
Q1 11.38
Median 11.38
Q3 11.38
95-th percentile 11.38
Maximum 905.56
Range 995.83
Interquartile range 0

Descriptive statistics

Standard deviation 23.282
Coef of variation 1.9746
Kurtosis 1188.3
Mean 11.791
MAD 1.7031
Skewness 32.842
Sum 41833
Variance 542.04
Memory size 27.8 KiB
Value Count Frequency (%)  
11.38 3435 96.8%
 
17.8 1 0.0%
 
-7.72 1 0.0%
 
-31.14 1 0.0%
 
0.55 1 0.0%
 
-44.15 1 0.0%
 
11.18 1 0.0%
 
-21.05 1 0.0%
 
-86.8 1 0.0%
 
15.28 1 0.0%
 
Other values (104) 104 2.9%
 

Minimum 5 values

Value Count Frequency (%)  
-90.27 1 0.0%
 
-86.8 1 0.0%
 
-79.48 1 0.0%
 
-73.86 1 0.0%
 
-51.84 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
72.45 1 0.0%
 
261.66 1 0.0%
 
492.53 1 0.0%
 
870.4 1 0.0%
 
905.56 1 0.0%
 

Value_SP500_REAL_EARNINGS_GROWTH_YEAR
Numeric

Distinct count 33
Unique (%) 0.9%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -78.62
Minimum -79.48
Maximum 261.66
Zeros (%) 0.0%

Quantile statistics

Minimum -79.48
5-th percentile -79.48
Q1 -79.48
Median -79.48
Q3 -79.48
95-th percentile -79.48
Maximum 261.66
Range 341.14
Interquartile range 0

Descriptive statistics

Standard deviation 10.149
Coef of variation -0.12909
Kurtosis 419.42
Mean -78.62
MAD 1.7038
Skewness 17.108
Sum -278940
Variance 103.01
Memory size 27.8 KiB
Value Count Frequency (%)  
-79.48 3516 99.1%
 
17.8 1 0.0%
 
8.1 1 0.0%
 
261.66 1 0.0%
 
-28.02 1 0.0%
 
49.72 1 0.0%
 
0.36 1 0.0%
 
-6.49 1 0.0%
 
11.38 1 0.0%
 
10.69 1 0.0%
 
Other values (23) 23 0.6%
 

Minimum 5 values

Value Count Frequency (%)  
-79.48 3516 99.1%
 
-51.84 1 0.0%
 
-28.02 1 0.0%
 
-21.05 1 0.0%
 
-15.56 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
24.85 1 0.0%
 
36.07 1 0.0%
 
49.72 1 0.0%
 
72.45 1 0.0%
 
261.66 1 0.0%
 

Value_SP500_REAL_PRICE_MONTH
Numeric

Distinct count 1400
Unique (%) 39.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 133.25
Minimum 2.73
Maximum 2789.8
Zeros (%) 0.0%

Quantile statistics

Minimum 2.73
5-th percentile 4.37
Q1 4.37
Median 4.37
Q3 16.293
95-th percentile 1149.6
Maximum 2789.8
Range 2787.1
Interquartile range 11.922

Descriptive statistics

Standard deviation 391.81
Coef of variation 2.9404
Kurtosis 14.745
Mean 133.25
MAD 208.57
Skewness 3.7646
Sum 472780
Variance 153520
Memory size 27.8 KiB
Value Count Frequency (%)  
4.37 1784 50.3%
 
4.46 7 0.2%
 
5.32 6 0.2%
 
5.3 6 0.2%
 
5.18 6 0.2%
 
7.68 6 0.2%
 
4.59 5 0.1%
 
5.51 5 0.1%
 
4.65 5 0.1%
 
8.12 5 0.1%
 
Other values (1390) 1713 48.3%
 

Minimum 5 values

Value Count Frequency (%)  
2.73 1 0.0%
 
2.85 1 0.0%
 
2.94 2 0.1%
 
3.05 1 0.0%
 
3.17 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
2702.77 1 0.0%
 
2705.16 1 0.0%
 
2736.61 1 0.0%
 
2754.35 1 0.0%
 
2789.8 1 0.0%
 

Value_SP500_REAL_SALES_GROWTH_QUARTER
Numeric

Distinct count 66
Unique (%) 1.9%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -1.7438
Minimum -15.81
Maximum 8.89
Zeros (%) 0.0%

Quantile statistics

Minimum -15.81
5-th percentile -1.8
Q1 -1.8
Median -1.8
Q3 -1.8
95-th percentile -1.8
Maximum 8.89
Range 24.7
Interquartile range 0

Descriptive statistics

Standard deviation 0.86472
Coef of variation -0.49588
Kurtosis 96.112
Mean -1.7438
MAD 0.15904
Skewness 3.3383
Sum -6187
Variance 0.74774
Memory size 27.8 KiB
Value Count Frequency (%)  
-1.8 3483 98.2%
 
7.95 1 0.0%
 
7.72 1 0.0%
 
-6.85 1 0.0%
 
5.72 1 0.0%
 
7.23 1 0.0%
 
1.13 1 0.0%
 
2.88 1 0.0%
 
4.39 1 0.0%
 
5.09 1 0.0%
 
Other values (56) 56 1.6%
 

Minimum 5 values

Value Count Frequency (%)  
-15.81 1 0.0%
 
-12.69 1 0.0%
 
-12.66 1 0.0%
 
-9.91 1 0.0%
 
-8.35 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
7.53 1 0.0%
 
7.72 1 0.0%
 
7.95 1 0.0%
 
8.87 1 0.0%
 
8.89 1 0.0%
 

Value_SP500_REAL_SALES_GROWTH_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_SALES_GROWTH_YEAR and should be ignored for analysis

Correlation 0.99905

Value_SP500_REAL_SALES_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_SALES_QUARTER and should be ignored for analysis

Correlation 0.98094

Value_SP500_REAL_SALES_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_REAL_SALES_GROWTH_YEAR and should be ignored for analysis

Correlation 0.91447

Value_SP500_SALES_GROWTH_QUARTER
Numeric

Distinct count 67
Unique (%) 1.9%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -16.087
Minimum -16.46
Maximum 12.56
Zeros (%) 0.0%

Quantile statistics

Minimum -16.46
5-th percentile -16.46
Q1 -16.46
Median -16.46
Q3 -16.46
95-th percentile -16.46
Maximum 12.56
Range 29.02
Interquartile range 0

Descriptive statistics

Standard deviation 2.8312
Coef of variation -0.176
Kurtosis 61.813
Mean -16.087
MAD 0.73302
Skewness 7.8448
Sum -57075
Variance 8.0156
Memory size 27.8 KiB
Value Count Frequency (%)  
-16.46 3482 98.1%
 
-2.47 1 0.0%
 
1.7 1 0.0%
 
0.2 1 0.0%
 
-8.45 1 0.0%
 
8.94 1 0.0%
 
7.62 1 0.0%
 
9.96 1 0.0%
 
8.04 1 0.0%
 
4.81 1 0.0%
 
Other values (57) 57 1.6%
 

Minimum 5 values

Value Count Frequency (%)  
-16.46 3482 98.1%
 
-12.86 1 0.0%
 
-11.95 1 0.0%
 
-8.45 1 0.0%
 
-7.97 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
10.93 1 0.0%
 
11.01 1 0.0%
 
11.29 1 0.0%
 
11.99 1 0.0%
 
12.56 1 0.0%
 

Value_SP500_SALES_GROWTH_YEAR
Numeric

Distinct count 21
Unique (%) 0.6%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -12.761
Minimum -12.86
Maximum 10.93
Zeros (%) 0.0%

Quantile statistics

Minimum -12.86
5-th percentile -12.86
Q1 -12.86
Median -12.86
Q3 -12.86
95-th percentile -12.86
Maximum 10.93
Range 23.79
Interquartile range 0

Descriptive statistics

Standard deviation 1.3605
Coef of variation -0.10661
Kurtosis 208.16
Mean -12.761
MAD 0.19651
Skewness 14.282
Sum -45277
Variance 1.8508
Memory size 27.8 KiB
Value Count Frequency (%)  
-12.86 3528 99.4%
 
2.24 1 0.0%
 
5.98 1 0.0%
 
5.37 1 0.0%
 
10.93 1 0.0%
 
4.16 1 0.0%
 
-1.18 1 0.0%
 
7.03 1 0.0%
 
1.7 1 0.0%
 
3.76 1 0.0%
 
Other values (11) 11 0.3%
 

Minimum 5 values

Value Count Frequency (%)  
-12.86 3528 99.4%
 
-8.45 1 0.0%
 
-3.11 1 0.0%
 
-1.18 1 0.0%
 
1.7 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
8.94 1 0.0%
 
9.03 1 0.0%
 
9.36 1 0.0%
 
10.88 1 0.0%
 
10.93 1 0.0%
 

Value_SP500_SALES_QUARTER
Numeric

Distinct count 71
Unique (%) 2.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 680.53
Minimum 674.59
Maximum 1292.8
Zeros (%) 0.0%

Quantile statistics

Minimum 674.59
5-th percentile 674.59
Q1 674.59
Median 674.59
Q3 674.59
95-th percentile 674.59
Maximum 1292.8
Range 618.25
Interquartile range 0

Descriptive statistics

Standard deviation 48.068
Coef of variation 0.070634
Kurtosis 79.358
Mean 680.53
MAD 11.646
Skewness 8.7691
Sum 2414500
Variance 2310.6
Memory size 27.8 KiB
Value Count Frequency (%)  
674.59 3478 98.0%
 
1127.13 1 0.0%
 
917.93 1 0.0%
 
998.54 1 0.0%
 
1185.81 1 0.0%
 
828.1 1 0.0%
 
1136.16 1 0.0%
 
738.81 1 0.0%
 
965.19 1 0.0%
 
981.21 1 0.0%
 
Other values (61) 61 1.7%
 

Minimum 5 values

Value Count Frequency (%)  
674.59 3478 98.0%
 
678.6 1 0.0%
 
684.42 1 0.0%
 
697.75 1 0.0%
 
697.9 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
1185.81 1 0.0%
 
1203.1 1 0.0%
 
1231.57 1 0.0%
 
1259.18 1 0.0%
 
1292.84 1 0.0%
 

Value_SP500_SALES_YEAR
Numeric

Distinct count 22
Unique (%) 0.6%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 676.69
Minimum 674.59
Maximum 1292.8
Zeros (%) 0.0%

Quantile statistics

Minimum 674.59
5-th percentile 674.59
Q1 674.59
Median 674.59
Q3 674.59
95-th percentile 674.59
Maximum 1292.8
Range 618.25
Interquartile range 0

Descriptive statistics

Standard deviation 30.328
Coef of variation 0.044818
Kurtosis 255.59
Mean 676.69
MAD 4.169
Skewness 15.665
Sum 2400900
Variance 919.76
Memory size 27.8 KiB
Value Count Frequency (%)  
674.59 3527 99.4%
 
1127.13 1 0.0%
 
1092.37 1 0.0%
 
1203.1 1 0.0%
 
1292.84 1 0.0%
 
1169.42 1 0.0%
 
1163.32 1 0.0%
 
1042.46 1 0.0%
 
874.32 1 0.0%
 
710.81 1 0.0%
 
Other values (12) 12 0.3%
 

Minimum 5 values

Value Count Frequency (%)  
674.59 3527 99.4%
 
710.81 1 0.0%
 
736.88 1 0.0%
 
745.7 1 0.0%
 
788.17 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
1169.42 1 0.0%
 
1203.1 1 0.0%
 
1231.57 1 0.0%
 
1259.18 1 0.0%
 
1292.84 1 0.0%
 

Correlations

Sample

Value_SP500_REAL_PRICE_MONTH Value_SP500_DIV_YIELD_MONTH Value_SP500_PE_RATIO_MONTH Value_SHILLER_PE_RATIO_MONTH Value_SP500_EARNINGS_YIELD_MONTH Value_SP500_INFLADJ_MONTH Value_SP500_PSR_QUARTER Value_SP500_DIV_MONTH Value_SP500_DIV_YEAR Value_SP500_DIV_GROWTH_YEAR Value_SP500_DIV_GROWTH_QUARTER Value_SP500_PBV_RATIO_QUARTER Value_SHILLER_PE_RATIO_YEAR Value_SP500_PE_RATIO_YEAR Value_SP500_DIV_YIELD_YEAR Value_SP500_PSR_YEAR Value_SP500_EARNINGS_YIELD_YEAR Value_SP500_PBV_RATIO_YEAR Value_SP500_INFLADJ_YEAR Value_SP500_SALES_YEAR Value_SP500_SALES_GROWTH_YEAR Value_SP500_SALES_QUARTER Value_SP500_REAL_SALES_GROWTH_QUARTER Value_SP500_SALES_GROWTH_QUARTER Value_SP500_REAL_SALES_GROWTH_YEAR Value_SP500_REAL_EARNINGS_GROWTH_YEAR Value_SP500_REAL_SALES_YEAR Value_SP500_REAL_EARNINGS_GROWTH_QUARTER Value_SP500_EARNINGS_GROWTH_QUARTER Value_SP500_REAL_SALES_QUARTER Value_SP500_EARNINGS_MONTH Value_SP500_BVPS_YEAR Value_SP500_EARNINGS_YEAR Value_SP500_EARNINGS_GROWTH_YEAR Value_SP500_BVPS_QUARTER
Date
1871-01-01 4.44 4.22 11.10 11.34 9.01 89.81 1.43 7.19 6.70 -21.07 11.89 2.58 11.9 11.10 4.07 1.31 9.01 2.17 89.81 674.59 -12.86 674.59 -1.8 -16.46 -12.69 -79.48 943.84 11.38 17.75 942.41 8.83 290.68 7.95 -77.52 290.68
1871-01-31 4.37 5.86 15.61 11.34 5.62 99.41 1.43 5.26 5.15 -21.07 11.89 2.58 11.9 11.82 5.86 1.31 5.33 2.17 81.79 674.59 -12.86 674.59 -1.8 -16.46 -12.69 -79.48 943.84 11.38 17.75 942.41 8.09 290.68 7.92 -77.52 290.68
1871-02-01 4.50 4.22 11.25 10.92 8.89 88.33 1.43 7.19 6.70 -21.07 11.89 2.58 11.9 11.82 4.07 1.31 5.33 2.17 81.79 674.59 -12.86 674.59 -1.8 -16.46 -12.69 -79.48 943.84 11.38 17.75 942.41 8.83 290.68 7.95 -77.52 290.68
1871-02-28 4.37 5.78 15.61 11.34 5.62 99.41 1.43 5.10 6.70 -21.07 11.89 2.58 11.9 11.82 4.07 1.31 5.33 2.17 81.79 674.59 -12.86 674.59 -1.8 -16.46 -12.69 -79.48 943.84 11.38 17.75 942.41 7.85 290.68 7.95 -77.52 290.68
1871-03-01 4.61 4.22 11.52 11.19 8.68 89.17 1.43 7.19 6.70 -21.07 11.89 2.58 11.9 11.82 4.07 1.31 5.33 2.17 81.79 674.59 -12.86 674.59 -1.8 -16.46 -12.69 -79.48 943.84 11.38 17.75 942.41 8.83 290.68 7.95 -77.52 290.68
In [28]:
#Apply linear interpolation on the dataset.
df_interpolate = df.interpolate(method='linear',axis=0,inplace=False,limit_direction='both')
In [29]:
print(df_imputed.shape)
print(df_interpolate.shape)
(3548, 35)
(3548, 35)
In [30]:
#Full profile on interpolated dataset.
pandas_profiling.ProfileReport(df_interpolate)
Out[30]:

Overview

Dataset info

Number of variables 36
Number of observations 3548
Total Missing (%) 0.0%
Total size in memory 998.0 KiB
Average record size in memory 288.0 B

Variables types

Numeric 13
Categorical 0
Boolean 0
Date 1
Text (Unique) 0
Rejected 22
Unsupported 0

Warnings

Variables

Date
Date

Distinct count 3548
Unique (%) 100.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Minimum 1871-01-01 00:00:00
Maximum 2018-12-31 00:00:00

Value_SHILLER_PE_RATIO_MONTH
Numeric

Distinct count 2354
Unique (%) 66.3%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 16.578
Minimum 4.78
Maximum 44.19
Zeros (%) 0.0%

Quantile statistics

Minimum 4.78
5-th percentile 7.9768
Q1 11.62
Median 15.655
Q3 20.152
95-th percentile 28.102
Maximum 44.19
Range 39.41
Interquartile range 8.5325

Descriptive statistics

Standard deviation 6.676
Coef of variation 0.4027
Kurtosis 1.9619
Mean 16.578
MAD 5.1107
Skewness 1.1209
Sum 58820
Variance 44.57
Memory size 27.8 KiB
Value Count Frequency (%)  
13.8 7 0.2%
 
11.34 7 0.2%
 
17.65 7 0.2%
 
15.23 7 0.2%
 
17.82 7 0.2%
 
12.43 6 0.2%
 
10.91 6 0.2%
 
11.64 6 0.2%
 
15.27 6 0.2%
 
18.07 6 0.2%
 
Other values (2344) 3483 98.2%
 

Minimum 5 values

Value Count Frequency (%)  
4.78 1 0.0%
 
4.95 1 0.0%
 
4.955 1 0.0%
 
5.02 1 0.0%
 
5.04 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
43.7 1 0.0%
 
43.77 1 0.0%
 
43.83 1 0.0%
 
43.980000000000004 1 0.0%
 
44.19 1 0.0%
 

Value_SHILLER_PE_RATIO_YEAR
Highly correlated

This variable is highly correlated with Value_SHILLER_PE_RATIO_MONTH and should be ignored for analysis

Correlation 0.98989

Value_SP500_BVPS_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_BVPS_YEAR and should be ignored for analysis

Correlation 0.99959

Value_SP500_BVPS_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_SALES_QUARTER and should be ignored for analysis

Correlation 0.97772

Value_SP500_DIV_GROWTH_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_DIV_GROWTH_YEAR and should be ignored for analysis

Correlation 0.99279

Value_SP500_DIV_GROWTH_YEAR
Numeric

Distinct count 689
Unique (%) 19.4%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 11.924
Minimum -21.07
Maximum 18.25
Zeros (%) 0.0%

Quantile statistics

Minimum -21.07
5-th percentile 2.3661
Q1 13.38
Median 13.38
Q3 13.38
95-th percentile 13.38
Maximum 18.25
Range 39.32
Interquartile range 0

Descriptive statistics

Standard deviation 4.2003
Coef of variation 0.35224
Kurtosis 15.155
Mean 11.924
MAD 2.5096
Skewness -3.5077
Sum 42308
Variance 17.643
Memory size 27.8 KiB
Value Count Frequency (%)  
13.38 2856 80.5%
 
8.65 4 0.1%
 
10.68 2 0.1%
 
13.4225 1 0.0%
 
4.31625 1 0.0%
 
16.26 1 0.0%
 
11.246666666666666 1 0.0%
 
13.151666666666667 1 0.0%
 
17.255000000000003 1 0.0%
 
12.135 1 0.0%
 
Other values (679) 679 19.1%
 

Minimum 5 values

Value Count Frequency (%)  
-21.07 1 0.0%
 
-20.131666666666668 1 0.0%
 
-20.09375 1 0.0%
 
-19.193333333333335 1 0.0%
 
-19.1175 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
17.989166666666666 1 0.0%
 
18.00125 1 0.0%
 
18.08416666666667 1 0.0%
 
18.167083333333334 1 0.0%
 
18.25 1 0.0%
 

Value_SP500_DIV_MONTH
Highly correlated

This variable is highly correlated with Value_SP500_INFLADJ_MONTH and should be ignored for analysis

Correlation 0.91991

Value_SP500_DIV_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_DIV_MONTH and should be ignored for analysis

Correlation 0.99956

Value_SP500_DIV_YIELD_MONTH
Numeric

Distinct count 1295
Unique (%) 36.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 4.3469
Minimum 1.11
Maximum 13.84
Zeros (%) 0.0%

Quantile statistics

Minimum 1.11
5-th percentile 1.74
Q1 3.1588
Median 4.285
Q3 5.39
95-th percentile 7.18
Maximum 13.84
Range 12.73
Interquartile range 2.2312

Descriptive statistics

Standard deviation 1.6983
Coef of variation 0.39069
Kurtosis 0.77793
Mean 4.3469
MAD 1.3423
Skewness 0.47674
Sum 15423
Variance 2.8842
Memory size 27.8 KiB
Value Count Frequency (%)  
5.18 15 0.4%
 
5.01 13 0.4%
 
1.94 12 0.3%
 
1.76 12 0.3%
 
5.22 12 0.3%
 
3.7 11 0.3%
 
4.22 11 0.3%
 
4.92 10 0.3%
 
2.96 10 0.3%
 
3.49 10 0.3%
 
Other values (1285) 3432 96.7%
 

Minimum 5 values

Value Count Frequency (%)  
1.11 3 0.1%
 
1.12 1 0.0%
 
1.13 1 0.0%
 
1.1349999999999998 1 0.0%
 
1.14 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
12.46 1 0.0%
 
12.64 1 0.0%
 
13.15 1 0.0%
 
13.24 1 0.0%
 
13.84 1 0.0%
 

Value_SP500_DIV_YIELD_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_DIV_YIELD_MONTH and should be ignored for analysis

Correlation 0.97677

Value_SP500_EARNINGS_GROWTH_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_REAL_EARNINGS_GROWTH_QUARTER and should be ignored for analysis

Correlation 0.99925

Value_SP500_EARNINGS_GROWTH_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_REAL_EARNINGS_GROWTH_YEAR and should be ignored for analysis

Correlation 0.99856

Value_SP500_EARNINGS_MONTH
Highly correlated

This variable is highly correlated with Value_SP500_INFLADJ_YEAR and should be ignored for analysis

Correlation 0.9169

Value_SP500_EARNINGS_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_EARNINGS_MONTH and should be ignored for analysis

Correlation 0.99519

Value_SP500_EARNINGS_YIELD_MONTH
Numeric

Distinct count 1649
Unique (%) 46.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 7.3645
Minimum 0.81
Maximum 18.82
Zeros (%) 0.0%

Quantile statistics

Minimum 0.81
5-th percentile 3.8938
Q1 5.55
Median 6.78
Q3 8.755
95-th percentile 12.783
Maximum 18.82
Range 18.01
Interquartile range 3.205

Descriptive statistics

Standard deviation 2.7021
Coef of variation 0.36691
Kurtosis 1.1874
Mean 7.3645
MAD 2.0858
Skewness 0.96555
Sum 26129
Variance 7.3012
Memory size 27.8 KiB
Value Count Frequency (%)  
5.62 15 0.4%
 
5.29 13 0.4%
 
5.54 13 0.4%
 
5.33 12 0.3%
 
5.72 11 0.3%
 
6.23 11 0.3%
 
5.3 11 0.3%
 
5.38 11 0.3%
 
5.48 11 0.3%
 
5.26 11 0.3%
 
Other values (1639) 3429 96.6%
 

Minimum 5 values

Value Count Frequency (%)  
0.81 3 0.1%
 
0.8200000000000001 1 0.0%
 
0.83 1 0.0%
 
0.87 1 0.0%
 
0.895 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
17.845 1 0.0%
 
18.12 1 0.0%
 
18.48 1 0.0%
 
18.65 1 0.0%
 
18.82 1 0.0%
 

Value_SP500_EARNINGS_YIELD_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_EARNINGS_YIELD_MONTH and should be ignored for analysis

Correlation 0.98185

Value_SP500_INFLADJ_MONTH
Highly correlated

This variable is highly correlated with Value_SP500_REAL_PRICE_MONTH and should be ignored for analysis

Correlation 0.9681

Value_SP500_INFLADJ_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_DIV_YEAR and should be ignored for analysis

Correlation 0.92035

Value_SP500_PBV_RATIO_QUARTER
Numeric

Distinct count 333
Unique (%) 9.4%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 4.7592
Minimum 1.78
Maximum 5.06
Zeros (%) 0.0%

Quantile statistics

Minimum 1.78
5-th percentile 2.6767
Q1 5.05
Median 5.05
Q3 5.05
95-th percentile 5.05
Maximum 5.06
Range 3.28
Interquartile range 0

Descriptive statistics

Standard deviation 0.79199
Coef of variation 0.16641
Kurtosis 4.4417
Mean 4.7592
MAD 0.50877
Skewness -2.475
Sum 16886
Variance 0.62724
Memory size 27.8 KiB
Value Count Frequency (%)  
5.05 3096 87.3%
 
2.76 9 0.3%
 
2.91 8 0.2%
 
2.74 7 0.2%
 
2.73 5 0.1%
 
2.19 4 0.1%
 
2.63 4 0.1%
 
2.58 4 0.1%
 
2.81 4 0.1%
 
2.763333333333333 4 0.1%
 
Other values (323) 403 11.4%
 

Minimum 5 values

Value Count Frequency (%)  
1.78 1 0.0%
 
1.7983333333333333 1 0.0%
 
1.8166666666666667 2 0.1%
 
1.835 1 0.0%
 
1.85 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
5.053333333333333 1 0.0%
 
5.055 1 0.0%
 
5.056666666666667 1 0.0%
 
5.058333333333333 1 0.0%
 
5.06 1 0.0%
 

Value_SP500_PBV_RATIO_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_PBV_RATIO_QUARTER and should be ignored for analysis

Correlation 0.99857

Value_SP500_PE_RATIO_MONTH
Numeric

Distinct count 2218
Unique (%) 62.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 15.731
Minimum 5.31
Maximum 123.73
Zeros (%) 0.0%

Quantile statistics

Minimum 5.31
5-th percentile 7.8202
Q1 11.429
Median 14.75
Q3 18.03
95-th percentile 25.691
Maximum 123.73
Range 118.42
Interquartile range 6.6013

Descriptive statistics

Standard deviation 8.3823
Coef of variation 0.53285
Kurtosis 67.083
Mean 15.731
MAD 4.6349
Skewness 6.4167
Sum 55814
Variance 70.263
Memory size 27.8 KiB
Value Count Frequency (%)  
15.61 8 0.2%
 
12.21 7 0.2%
 
9.84 7 0.2%
 
10.34 7 0.2%
 
14.51 6 0.2%
 
11.48 6 0.2%
 
14.75 6 0.2%
 
10.98 6 0.2%
 
19.0 6 0.2%
 
15.11 5 0.1%
 
Other values (2208) 3484 98.2%
 

Minimum 5 values

Value Count Frequency (%)  
5.31 1 0.0%
 
5.359999999999999 1 0.0%
 
5.41 1 0.0%
 
5.525 1 0.0%
 
5.609999999999999 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
119.85 1 0.0%
 
121.78999999999999 1 0.0%
 
123.32 1 0.0%
 
123.525 1 0.0%
 
123.73 1 0.0%
 

Value_SP500_PE_RATIO_YEAR
Numeric

Distinct count 3346
Unique (%) 94.3%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 15.741
Minimum 5.74
Maximum 70.91
Zeros (%) 0.0%

Quantile statistics

Minimum 5.74
5-th percentile 8.0235
Q1 11.653
Median 15.053
Q3 18.011
95-th percentile 25.639
Maximum 70.91
Range 65.17
Interquartile range 6.3581

Descriptive statistics

Standard deviation 6.6443
Coef of variation 0.42211
Kurtosis 14.325
Mean 15.741
MAD 4.3673
Skewness 2.7959
Sum 55848
Variance 44.147
Memory size 27.8 KiB
Value Count Frequency (%)  
18.08 3 0.1%
 
17.496666666666666 3 0.1%
 
15.59 3 0.1%
 
11.82 3 0.1%
 
14.835 3 0.1%
 
17.45 3 0.1%
 
16.85 3 0.1%
 
17.21666666666667 3 0.1%
 
16.3 3 0.1%
 
12.95125 3 0.1%
 
Other values (3336) 3518 99.2%
 

Minimum 5 values

Value Count Frequency (%)  
5.74 1 0.0%
 
5.765000000000001 1 0.0%
 
5.79 1 0.0%
 
5.815 1 0.0%
 
5.832916666666667 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
66.72583333333333 1 0.0%
 
66.78916666666666 1 0.0%
 
68.81791666666666 1 0.0%
 
68.84958333333333 1 0.0%
 
70.91 1 0.0%
 

Value_SP500_PSR_QUARTER
Numeric

Distinct count 300
Unique (%) 8.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.738
Minimum 0.8
Maximum 2.31
Zeros (%) 0.0%

Quantile statistics

Minimum 0.8
5-th percentile 1.4339
Q1 1.77
Median 1.77
Q3 1.77
95-th percentile 1.77
Maximum 2.31
Range 1.51
Interquartile range 0

Descriptive statistics

Standard deviation 0.13545
Coef of variation 0.077933
Kurtosis 13.779
Mean 1.738
MAD 0.067544
Skewness -3.163
Sum 6166.4
Variance 0.018346
Memory size 27.8 KiB
Value Count Frequency (%)  
1.77 3122 88.0%
 
1.44 10 0.3%
 
1.66 7 0.2%
 
1.52 7 0.2%
 
2.1 7 0.2%
 
1.43 6 0.2%
 
1.46 6 0.2%
 
1.3166666666666667 5 0.1%
 
1.47 4 0.1%
 
1.33 4 0.1%
 
Other values (290) 370 10.4%
 

Minimum 5 values

Value Count Frequency (%)  
0.8 1 0.0%
 
0.8116666666666668 1 0.0%
 
0.8233333333333334 1 0.0%
 
0.8283333333333334 1 0.0%
 
0.835 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
2.19 1 0.0%
 
2.205 1 0.0%
 
2.25 1 0.0%
 
2.2575000000000003 1 0.0%
 
2.31 1 0.0%
 

Value_SP500_PSR_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_PSR_QUARTER and should be ignored for analysis

Correlation 0.98853

Value_SP500_REAL_EARNINGS_GROWTH_QUARTER
Numeric

Distinct count 675
Unique (%) 19.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -1.6836
Minimum -90.27
Maximum 905.56
Zeros (%) 0.0%

Quantile statistics

Minimum -90.27
5-th percentile -7.94
Q1 -7.94
Median -7.94
Q3 -7.94
95-th percentile 16.01
Maximum 905.56
Range 995.83
Interquartile range 0

Descriptive statistics

Standard deviation 56.051
Coef of variation -33.292
Kurtosis 184.12
Mean -1.6836
MAD 13.005
Skewness 12.982
Sum -5973.4
Variance 3141.7
Memory size 27.8 KiB
Value Count Frequency (%)  
-7.94 2856 80.5%
 
14.93 9 0.3%
 
11.38 7 0.2%
 
-0.54 2 0.1%
 
-12.535 2 0.1%
 
-13.59 2 0.1%
 
14.05 2 0.1%
 
13.870000000000001 1 0.0%
 
-14.786666666666667 1 0.0%
 
-17.93999999999999 1 0.0%
 
Other values (665) 665 18.7%
 

Minimum 5 values

Value Count Frequency (%)  
-90.27 1 0.0%
 
-89.69166666666666 1 0.0%
 
-89.11333333333333 1 0.0%
 
-88.535 1 0.0%
 
-88.47166666666666 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
882.12 1 0.0%
 
887.98 1 0.0%
 
893.8399999999999 1 0.0%
 
899.6999999999999 1 0.0%
 
905.56 1 0.0%
 

Value_SP500_REAL_EARNINGS_GROWTH_YEAR
Numeric

Distinct count 685
Unique (%) 19.3%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -3.8153
Minimum -79.48
Maximum 261.66
Zeros (%) 0.0%

Quantile statistics

Minimum -79.48
5-th percentile -7.94
Q1 -7.94
Median -7.94
Q3 -7.94
95-th percentile 16.192
Maximum 261.66
Range 341.14
Interquartile range 0

Descriptive statistics

Standard deviation 20.964
Coef of variation -5.4947
Kurtosis 69.189
Mean -3.8153
MAD 8.4864
Skewness 7.1384
Sum -13537
Variance 439.48
Memory size 27.8 KiB
Value Count Frequency (%)  
-7.94 2856 80.5%
 
14.93 9 0.3%
 
-15.020833333333332 1 0.0%
 
4.810000000000002 1 0.0%
 
12.56375 1 0.0%
 
-28.35375 1 0.0%
 
22.085 1 0.0%
 
34.3975 1 0.0%
 
-0.12124999999999997 1 0.0%
 
7.221666666666667 1 0.0%
 
Other values (675) 675 19.0%
 

Minimum 5 values

Value Count Frequency (%)  
-79.48 1 0.0%
 
-77.04541666666668 1 0.0%
 
-74.61083333333335 1 0.0%
 
-72.17625000000001 1 0.0%
 
-69.74166666666667 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
235.16750000000002 1 0.0%
 
243.99833333333336 1 0.0%
 
247.44583333333338 1 0.0%
 
252.82916666666668 1 0.0%
 
261.66 1 0.0%
 

Value_SP500_REAL_PRICE_MONTH
Numeric

Distinct count 2827
Unique (%) 79.7%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 267.14
Minimum 2.73
Maximum 2789.8
Zeros (%) 0.0%

Quantile statistics

Minimum 2.73
5-th percentile 4.4
Q1 7.8138
Median 16.555
Q3 128.36
95-th percentile 1428.9
Maximum 2789.8
Range 2787.1
Interquartile range 120.55

Descriptive statistics

Standard deviation 533.7
Coef of variation 1.9978
Kurtosis 5.5084
Mean 267.14
MAD 366.21
Skewness 2.4362
Sum 947810
Variance 284830
Memory size 27.8 KiB
Value Count Frequency (%)  
7.68 11 0.3%
 
5.18 9 0.3%
 
4.37 9 0.3%
 
4.46 9 0.3%
 
2736.61 8 0.2%
 
5.25 8 0.2%
 
9.3 7 0.2%
 
4.54 7 0.2%
 
8.83 6 0.2%
 
5.33 6 0.2%
 
Other values (2817) 3468 97.7%
 

Minimum 5 values

Value Count Frequency (%)  
2.73 1 0.0%
 
2.79 1 0.0%
 
2.835 1 0.0%
 
2.85 1 0.0%
 
2.94 3 0.1%
 

Maximum 5 values

Value Count Frequency (%)  
2736.61 8 0.2%
 
2745.48 1 0.0%
 
2747.48 1 0.0%
 
2754.35 1 0.0%
 
2789.8 1 0.0%
 

Value_SP500_REAL_SALES_GROWTH_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_SALES_GROWTH_YEAR and should be ignored for analysis

Correlation 0.9797

Value_SP500_REAL_SALES_GROWTH_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_SALES_GROWTH_QUARTER and should be ignored for analysis

Correlation 0.96667

Value_SP500_REAL_SALES_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_REAL_SALES_YEAR and should be ignored for analysis

Correlation 0.99537

Value_SP500_REAL_SALES_YEAR
Highly correlated

This variable is highly correlated with Value_SP500_SALES_QUARTER and should be ignored for analysis

Correlation 0.90087

Value_SP500_SALES_GROWTH_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_REAL_SALES_GROWTH_QUARTER and should be ignored for analysis

Correlation 0.99017

Value_SP500_SALES_GROWTH_YEAR
Numeric

Distinct count 394
Unique (%) 11.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -0.64806
Minimum -12.86
Maximum 10.93
Zeros (%) 0.0%

Quantile statistics

Minimum -12.86
5-th percentile -1.18
Q1 -1.18
Median -1.18
Q3 -1.18
95-th percentile 5.1897
Maximum 10.93
Range 23.79
Interquartile range 0

Descriptive statistics

Standard deviation 2.4013
Coef of variation -3.7053
Kurtosis 11.202
Mean -0.64806
MAD 1.1652
Skewness 2.698
Sum -2299.3
Variance 5.766
Memory size 27.8 KiB
Value Count Frequency (%)  
-1.18 3144 88.6%
 
9.03 9 0.3%
 
3.92 2 0.1%
 
2.62 2 0.1%
 
3.68 2 0.1%
 
8.94 1 0.0%
 
6.684166666666667 1 0.0%
 
10.266666666666666 1 0.0%
 
1.44 1 0.0%
 
-2.6916666666666664 1 0.0%
 
Other values (384) 384 10.8%
 

Minimum 5 values

Value Count Frequency (%)  
-12.86 1 0.0%
 
-12.253333333333332 1 0.0%
 
-12.075 1 0.0%
 
-11.646666666666665 1 0.0%
 
-11.29 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
10.921666666666667 1 0.0%
 
10.92375 1 0.0%
 
10.925833333333333 1 0.0%
 
10.927916666666667 1 0.0%
 
10.93 1 0.0%
 

Value_SP500_SALES_QUARTER
Highly correlated

This variable is highly correlated with Value_SP500_SALES_YEAR and should be ignored for analysis

Correlation 0.99943

Value_SP500_SALES_YEAR
Numeric

Distinct count 421
Unique (%) 11.9%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 773.69
Minimum 674.59
Maximum 1292.8
Zeros (%) 0.0%

Quantile statistics

Minimum 674.59
5-th percentile 745.7
Q1 745.7
Median 745.7
Q3 745.7
95-th percentile 1036.8
Maximum 1292.8
Range 618.25
Interquartile range 0

Descriptive statistics

Standard deviation 96.243
Coef of variation 0.12439
Kurtosis 10.629
Mean 773.69
MAD 52.026
Skewness 3.3842
Sum 2745100
Variance 9262.8
Memory size 27.8 KiB
Value Count Frequency (%)  
745.7 3120 87.9%
 
1292.84 9 0.3%
 
1172.2266666666667 1 0.0%
 
791.7595833333332 1 0.0%
 
1140.8675 1 0.0%
 
870.7304166666668 1 0.0%
 
1240.7733333333333 1 0.0%
 
1236.1716666666666 1 0.0%
 
966.465 1 0.0%
 
775.2766666666666 1 0.0%
 
Other values (411) 411 11.6%
 

Minimum 5 values

Value Count Frequency (%)  
674.59 1 0.0%
 
676.0991666666667 1 0.0%
 
677.1854166666667 1 0.0%
 
677.6083333333333 1 0.0%
 
679.1175000000001 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
1270.4 1 0.0%
 
1276.01 1 0.0%
 
1281.62 1 0.0%
 
1287.23 1 0.0%
 
1292.84 9 0.3%
 

Correlations

Sample

Value_SP500_REAL_PRICE_MONTH Value_SP500_DIV_YIELD_MONTH Value_SP500_PE_RATIO_MONTH Value_SHILLER_PE_RATIO_MONTH Value_SP500_EARNINGS_YIELD_MONTH Value_SP500_INFLADJ_MONTH Value_SP500_PSR_QUARTER Value_SP500_DIV_MONTH Value_SP500_DIV_YEAR Value_SP500_DIV_GROWTH_YEAR Value_SP500_DIV_GROWTH_QUARTER Value_SP500_PBV_RATIO_QUARTER Value_SHILLER_PE_RATIO_YEAR Value_SP500_PE_RATIO_YEAR Value_SP500_DIV_YIELD_YEAR Value_SP500_PSR_YEAR Value_SP500_EARNINGS_YIELD_YEAR Value_SP500_PBV_RATIO_YEAR Value_SP500_INFLADJ_YEAR Value_SP500_SALES_YEAR Value_SP500_SALES_GROWTH_YEAR Value_SP500_SALES_QUARTER Value_SP500_REAL_SALES_GROWTH_QUARTER Value_SP500_SALES_GROWTH_QUARTER Value_SP500_REAL_SALES_GROWTH_YEAR Value_SP500_REAL_EARNINGS_GROWTH_YEAR Value_SP500_REAL_SALES_YEAR Value_SP500_REAL_EARNINGS_GROWTH_QUARTER Value_SP500_EARNINGS_GROWTH_QUARTER Value_SP500_REAL_SALES_QUARTER Value_SP500_EARNINGS_MONTH Value_SP500_BVPS_YEAR Value_SP500_EARNINGS_YEAR Value_SP500_EARNINGS_GROWTH_YEAR Value_SP500_BVPS_QUARTER
Date
1871-01-01 4.440 5.86 11.100 10.920 9.010 89.81 1.77 5.260 5.150000 13.38 13.38 5.05 11.9 11.100000 5.860000 1.77 9.010000 5.05 89.8100 745.7 -1.18 745.7 -3.66 -1.18 -3.66 -7.94 1087.39 -7.94 -3.71 1087.39 8.090 290.68 7.920000 -3.71 290.68
1871-01-31 4.470 5.86 11.175 10.920 8.950 89.07 1.77 5.260 5.150000 13.38 13.38 5.05 11.9 11.140417 5.860000 1.77 8.979583 5.05 90.1025 745.7 -1.18 745.7 -3.66 -1.18 -3.66 -7.94 1087.39 -7.94 -3.71 1087.39 8.090 290.68 7.920000 -3.71 290.68
1871-02-01 4.500 5.82 11.250 10.920 8.890 88.33 1.77 5.180 5.151364 13.38 13.38 5.05 11.9 11.180833 5.843182 1.77 8.949167 5.05 90.3950 745.7 -1.18 745.7 -3.66 -1.18 -3.66 -7.94 1087.39 -7.94 -3.71 1087.39 7.970 290.68 7.922273 -3.71 290.68
1871-02-28 4.555 5.78 11.385 11.055 8.785 88.75 1.77 5.100 5.152727 13.38 13.38 5.05 11.9 11.221250 5.826364 1.77 8.918750 5.05 90.6875 745.7 -1.18 745.7 -3.66 -1.18 -3.66 -7.94 1087.39 -7.94 -3.71 1087.39 7.850 290.68 7.924545 -3.71 290.68
1871-03-01 4.610 5.71 11.520 11.190 8.680 89.17 1.77 5.065 5.154091 13.38 13.38 5.05 11.9 11.261667 5.809545 1.77 8.888333 5.05 90.9800 745.7 -1.18 745.7 -3.66 -1.18 -3.66 -7.94 1087.39 -7.94 -3.71 1087.39 7.795 290.68 7.926818 -3.71 290.68
In [31]:
autocorrelation_plot(df_imputed)
plt.show()
In [32]:
autocorrelation_plot(df_interpolate)
plt.show()
In [33]:
#Corelation plot after imputation
plt.figure(figsize=(20,15))
sns.heatmap(df_imputed.corr(),annot=True,fmt='.2f',square=False)
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x23b2c873b00>
In [34]:
#Correlation heatmap plot after interpolation
plt.figure(figsize=(20,15))
sns.heatmap(df_interpolate.corr(),annot=True,fmt='.2f',square=False)
Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x23b2aa86400>
In [35]:
#Check for missing values per column and create a graph
Column_missing_values(df_imputed)
Column_missing_values(df_interpolate)
No missing values in provided dataframe
No missing values in provided dataframe
In [36]:
df_imputed.plot(figsize=(50,30),fontsize=30)
Out[36]:
<matplotlib.axes._subplots.AxesSubplot at 0x23b2cda4a20>
In [37]:
df_interpolate.plot(figsize=(30,20),fontsize=20)
Out[37]:
<matplotlib.axes._subplots.AxesSubplot at 0x23b2d3dd240>
In [38]:
df.hist(figsize=(15,10))
Out[38]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D558B00>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D58B4E0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D5B2DD8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D5DF860>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D60D320>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D60D358>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D662860>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D694320>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D6B7DA0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D6E6860>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D715320>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D739DA0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D76A860>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D79A320>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D7BFDA0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D7EE860>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D81D320>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D843DA0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D872860>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D8A3320>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D8C5DA0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D8F5860>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D924320>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D94BDA0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D978860>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D9A8320>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2EB00DA0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2EB2E860>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2EB5D320>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B30131DA0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B30161860>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B30192320>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B301B8DA0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B301E5860>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B30214320>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B3023ADA0>]],
      dtype=object)
In [39]:
df_imputed.hist(figsize=(30,20))
Out[39]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B302BD080>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D4829E8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B304DE080>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B305D30B8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B305F2B38>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B305F2B70>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B3063E0B8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B30663B38>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B306935F8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B306C30B8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B306E8B38>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B307195F8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B307460B8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B3076AB38>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B307985F8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B307CB0B8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B307EFB38>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B3081F5F8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B3084F0B8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B30873B38>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B308A25F8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B308D30B8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B308F8B38>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B309265F8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B309560B8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B3097CB38>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B309AB5F8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B309DB0B8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B309FFB38>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B30A2F5F8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B30A5E0B8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B30A83B38>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B30AB35F8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B30AE40B8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B30B08B38>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B30B375F8>]],
      dtype=object)
In [40]:
#Histogram plot of all variables in interpolated dataset
df_interpolate.hist(figsize=(30,20))
Out[40]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B3133FBA8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B331B7278>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B3312C3C8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B33226BE0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B332476A0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B332476D8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B3169EBE0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B316CD6A0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B316FD160>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31721BE0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B317516A0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31781160>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B317A6BE0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B317D46A0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31803160>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B3182ABE0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B318586A0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31888160>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B318ACBE0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B318DE6A0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B3190C160>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31932BE0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B319616A0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B3198F160>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B319B4BE0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B319E46A0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31A12160>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31A39BE0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31A686A0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31A99160>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31ABEBE0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31AED6A0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31B1F160>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31B41BE0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31B726A0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31BA1160>]],
      dtype=object)

Final analysis on interpolation and imputation datasets.

linear Interpolation on missing values looks promising and there is better correlation between variables of the dataset. Imputation has not shown better correlation between variables. I would use interpolation dataframe for further analysis.

Write dfs to csv files for further reference.

save df, df_imputed and df_iterpolated to csv file.

In [41]:
#Write dfs to csv file.
df.to_csv('SandP500_Index_Master.csv',index=False)
df_imputed.to_csv('SandP500_Index_df_imputed.csv',index=False)
df_interpolate.to_csv('SandP500_Index_df_Interpolated.csv',index=False)

Feature Scaling

Apply StandardScaler() to the imputed df and interpolated df to normalize feature values.
In [42]:
# Apply feature scaling on all values to the entire numerical dataframe.define function
def Apply_Standard_Scaler(df):
    '''
    This funtion applies StandardScaler() to columns/features of a given dataframe.
    IN- a pandas dataframe
    OUT - df_scaled_features dataframe of scaled features.
    scaler - StandardScaler() object.    
    '''
    scaler = StandardScaler()
    scaled_features = StandardScaler().fit_transform(df.values)
    df_scaled_features = pd.DataFrame(scaled_features,index=df.index,columns = df.columns)
    #df_scaled_features.describe()
    
    return df_scaled_features, scaler
In [43]:
print ('Number of columns present in imputed and interpolated datasets are : {} & {}'.format (
    len(df_imputed.columns),len(df_interpolate.columns)))
Number of columns present in imputed and interpolated datasets are : 35 & 35
In [44]:
# Apply standard scaling function to both dfs and return scaled df.
df_scaled_features_imputed, scaler_df_imputed = Apply_Standard_Scaler(df_imputed)
df_scaled_features_interpolation, scaler_df_interpolate = Apply_Standard_Scaler(df_interpolate)

Apply Principal Component Analysis (PCA) for feature selection

Apply PCA feature extraction analysis to find groups of features with highest and lowest variance.
In [45]:
#function to apply PCA feature scaling
def scree_plot(pca):
    '''
    Creates a scree plot associated with the principal components 
    
    INPUT: pca - the result of instance of PCA in scikit learn
            
    OUTPUT:
            None
    '''
    num_components = len(pca.explained_variance_ratio_)
    ind = np.arange(num_components)
    vals = pca.explained_variance_ratio_
 
    plt.figure(figsize=(25, 10))
    ax = plt.subplot(111)
    cumvals = np.cumsum(vals)
    ax.bar(ind, vals)
    ax.plot(ind, cumvals)
    #print (ind, cumvals)
    for i in range(num_components):
        ax.annotate(r"%s%%" % ((str(vals[i]*100)[:4])), (ind[i]+0.2, vals[i]), va="bottom", ha="center", fontsize=12)
     
 
    ax.xaxis.set_tick_params(width=0)
    ax.yaxis.set_tick_params(width=2, length=12)
 
    ax.set_xlabel("Principal Component")
    ax.set_ylabel("Variance Explained (%)")
    plt.title('Explained Variance Per Principal Component')
    

# Apply PCA to the data for all features
def Apply_PCA(df_scaled_features,n_components):
    '''
    This function would create a PCA object in scikit-learn with n_components and apply fit_transform function of PCA on scaled features df.
    This function internally calls another 'scree_plot' function to create a chart of feature variance.
    IN - 
    df_scaled_features - numerically scaled dataframe.
    n_components - n number of components for PCA feature analysis
    
    Output- Returns PCA object with provided n_components
    '''
    pca = PCA(n_components)
    pca_scaled_features = pca.fit_transform(df_scaled_features)
    scree_plot(pca)
    
    return pca


# Map weights for the first principal component to corresponding feature names
# and then print the linked values, sorted by weight.
def sorted_weights(pca, ix, dataset):
    """
    Docstring- map the weights and components from PCA analysis.
    Input parameters-
    Input - pca initialized model
    ix = index number of first set of components.
    dataset = a dataframe of scaled features
    
    Output- A Tuple of features with variance.
    Prints a bar chart with feature names and related variance.
    """
    a1 = pca.components_[ix]
    a2 = dataset.keys().values
    a = list(zip(a1, a2))
    a.sort(key=lambda tup: tup[0])
    x_labels = [val[0] for val in a]
    y_labels = [val[1] for val in a]
    plt.Figure(figsize=(20, 15))
    ax = pd.Series(x_labels).plot(kind='bar')
    ax.set_xticklabels(y_labels)
    rects = ax.patches
    # for rect, label in zip(rects, x_labels):
    #     height = rect.get_height()
    #     ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')
    return a


#Cluster of of features with highest variance
def Print_PCAfeatures_graph(df,a,n):
    '''
    Function to print PCA features in line chart with top n lowest variance or top n highest variance for a dataframe.
    IN- 
    df- dataframe on which PCA analysis was done.
    a - result from 'sorted_weights' function. A sequence of 2-d array with sorted weights of features.
    n- number of features required to be printed in the chart. Negative (-n) shall show features with positive variance while 
    positive (n) would show features with negative variance.
    
    OUT- name of features from cluster a.
    '''    
    groups = []
    for i in range(len(a)):
        groups.append(a[i][1])
    
    #Slice 1-D array appropriately
    if n < 0:
        groups = groups[n:]
    else:
        groups = groups[:n]
    
    #print(groups)
    
    i = 1
    for group in groups:
        plt.subplot(len(groups), 1, i)
        plt.plot(df[group].values)
        plt.title(group, y=0.5, loc='center')
        i += 1
    plt.figure(figsize=(30,20))
    plt.show()
    
    return groups
Visualize PCA feature extraction and variance for imputed and interpolated dfs
In [46]:
#Apply PCA for all features except target output.
pca_imputed = Apply_PCA(df_scaled_features_imputed.drop(['Value_SP500_REAL_PRICE_MONTH'],axis=1),n_components=10)
In [47]:
#Apply PCA for all features except target output.
pca_interpolated = Apply_PCA(df_scaled_features_interpolation.drop(['Value_SP500_REAL_PRICE_MONTH'],axis=1),n_components=10)
Visualize weights of features for first group of extracted features with highest variance
In [48]:
# #Cluster of of features with highest variance
# def Print_PCAfeatures_graph(df,a,n):
#     '''
#     Function to print PCA features in line chart with top n lowest variance or top n highest variance for a dataframe.
#     IN- 
#     df- dataframe on which PCA analysis was done.
#     a - result from 'sorted_weights' function. A sequence of 2-d array with sorted weights of features.
#     n- number of features required to be printed in the chart. Negative (-n) shall show features with maximum variance while 
#     positive n would show features with lowest variance.
    
#     OUT- name of features from cluster a.
#     '''    
#     groups = []
#     for i in range(len(a)):
#         groups.append(a[i][1])
    
#     #Slice 1-D array appropriately
#     if n < 0:
#         groups = groups[n:]
#     else:
#         groups = groups[:n]
    
#     #print(groups)
    
#     i = 1
#     for group in groups:
#         plt.subplot(len(groups), 1, i)
#         plt.plot(df[group].values)
#         plt.title(group, y=0.5, loc='center')
#         i += 1
#     plt.figure(figsize=(30,20))
#     plt.show()
    
#     return groups
In [49]:
#List cluster of features with highest variance from PCA analysis
a = sorted_weights(pca_imputed,1,df_scaled_features_imputed)
In [50]:
#print chart with top 5 features with maximum variance
Print_PCAfeatures_graph(df_imputed,a,-5)
<Figure size 3000x2000 with 0 Axes>
Out[50]:
['Value_SHILLER_PE_RATIO_YEAR',
 'Value_SP500_EARNINGS_YIELD_MONTH',
 'Value_SP500_PE_RATIO_MONTH',
 'Value_SP500_PBV_RATIO_YEAR',
 'Value_SP500_PBV_RATIO_QUARTER']
In [51]:
#print chart with top 5 features with lowest variance
Print_PCAfeatures_graph(df_imputed,a,5)
<Figure size 3000x2000 with 0 Axes>
Out[51]:
['Value_SHILLER_PE_RATIO_MONTH',
 'Value_SP500_PSR_QUARTER',
 'Value_SP500_REAL_SALES_QUARTER',
 'Value_SP500_PSR_YEAR',
 'Value_SP500_REAL_SALES_GROWTH_QUARTER']
In [52]:
b = sorted_weights(pca_interpolated,1,df_scaled_features_interpolation)
b
Out[52]:
[(-0.3099270964617886, 'Value_SP500_DIV_GROWTH_YEAR'),
 (-0.30472102931433814, 'Value_SP500_DIV_YEAR'),
 (-0.24703832789002747, 'Value_SP500_REAL_SALES_GROWTH_QUARTER'),
 (-0.23874586737089795, 'Value_SP500_SALES_YEAR'),
 (-0.2383254455928687, 'Value_SP500_SALES_QUARTER'),
 (-0.22862267710171758, 'Value_SP500_SALES_GROWTH_QUARTER'),
 (-0.2276975766969936, 'Value_SP500_EARNINGS_GROWTH_QUARTER'),
 (-0.2205363310620414, 'Value_SP500_REAL_EARNINGS_GROWTH_YEAR'),
 (-0.18570532863264552, 'Value_SHILLER_PE_RATIO_MONTH'),
 (-0.18030144089471137, 'Value_SP500_PSR_YEAR'),
 (-0.14675080518992814, 'Value_SP500_SALES_GROWTH_YEAR'),
 (-0.14261472452590546, 'Value_SP500_INFLADJ_YEAR'),
 (-0.13310173004335377, 'Value_SP500_PE_RATIO_YEAR'),
 (-0.13165915021671745, 'Value_SP500_REAL_PRICE_MONTH'),
 (-0.1275332154522852, 'Value_SP500_INFLADJ_MONTH'),
 (-0.11806250612104319, 'Value_SP500_EARNINGS_GROWTH_YEAR'),
 (-0.1177629782635305, 'Value_SP500_EARNINGS_MONTH'),
 (-0.11682443659890858, 'Value_SP500_DIV_YIELD_YEAR'),
 (-0.06950409932183431, 'Value_SP500_REAL_SALES_QUARTER'),
 (-0.0560163591773991, 'Value_SP500_BVPS_YEAR'),
 (-0.027145876828283743, 'Value_SP500_PSR_QUARTER'),
 (-0.02606010953354377, 'Value_SP500_DIV_MONTH'),
 (-0.007376927107057215, 'Value_SP500_DIV_GROWTH_QUARTER'),
 (-0.002314480823033611, 'Value_SP500_EARNINGS_YIELD_YEAR'),
 (0.022568969230881045, 'Value_SP500_EARNINGS_YIELD_MONTH'),
 (0.025265445913567188, 'Value_SP500_PBV_RATIO_YEAR'),
 (0.12553339929260882, 'Value_SP500_PE_RATIO_MONTH'),
 (0.12877894460506153, 'Value_SP500_PBV_RATIO_QUARTER'),
 (0.1652084917363736, 'Value_SP500_REAL_EARNINGS_GROWTH_QUARTER'),
 (0.1688368159634833, 'Value_SP500_REAL_SALES_YEAR'),
 (0.20153174127326504, 'Value_SHILLER_PE_RATIO_YEAR'),
 (0.21577231387568555, 'Value_SP500_EARNINGS_YEAR'),
 (0.2169762289480813, 'Value_SP500_DIV_YIELD_MONTH'),
 (0.22110502456487266, 'Value_SP500_REAL_SALES_GROWTH_YEAR')]
In [53]:
#print chart with top 5 features with maximum variance
Print_PCAfeatures_graph(df_interpolate,b,-6)
<Figure size 3000x2000 with 0 Axes>
Out[53]:
['Value_SP500_REAL_EARNINGS_GROWTH_QUARTER',
 'Value_SP500_REAL_SALES_YEAR',
 'Value_SHILLER_PE_RATIO_YEAR',
 'Value_SP500_EARNINGS_YEAR',
 'Value_SP500_DIV_YIELD_MONTH',
 'Value_SP500_REAL_SALES_GROWTH_YEAR']
In [54]:
Print_PCAfeatures_graph(df_interpolate,b,8)
<Figure size 3000x2000 with 0 Axes>
Out[54]:
['Value_SP500_DIV_GROWTH_YEAR',
 'Value_SP500_DIV_YEAR',
 'Value_SP500_REAL_SALES_GROWTH_QUARTER',
 'Value_SP500_SALES_YEAR',
 'Value_SP500_SALES_QUARTER',
 'Value_SP500_SALES_GROWTH_QUARTER',
 'Value_SP500_EARNINGS_GROWTH_QUARTER',
 'Value_SP500_REAL_EARNINGS_GROWTH_YEAR']

Split dataset into Train and Test set.

Split 80% of dataset into train and 20% into test based on chronological order because its a timeseries. Avoid look-ahead bias by doing this.
In [55]:
def Create_Training_Test_Dataset(df,split_percent,Linear_regr):
    '''
    This function would split, slice and create training and test datasets. Provide 'Value_SP500_REAL_PRICE_MONTH' in the input 
    df. It would input to X
    
    IN- df- a dataframe from which training and test dataset needs to slice.
    split_percent - split percent for training and test dataset.
    Linear_regr - A flag to split between dataset for linear regression object or LSTM network. Works for this project only.
    
    OUT- X_train, Y_train, X_test, Y_test
    
    '''
       
    # Split the size into 80% and 20% based on rows.
    train_size = int(len(df) * split_percent)
    test_size = len(df) - train_size
    #print(train_size,test_size)
    print('Training and Test dataset is of size {} & {}'.format(train_size,test_size))
    
    #Slice the df into train and test df.
    train = df.iloc[0:train_size,:]
    test = df.iloc[train_size:len(df),:]
    #print(train.shape, test.shape)
    
    #Check for Linear Regression flag from user input. If false it would return dataset for LSTM neural network.
    if Linear_regr == 'False':
        #Create Training dataset
        temp_train = train.drop(['Value_SP500_REAL_PRICE_MONTH'],axis=1,inplace=False)
        X_train = temp_train.iloc[0:train_size,:]
        #X_train.head()
        Y_train = train.iloc[0:train_size,:1]
        #print(Y_train.head())
        #print(X_train.shape,Y_train.shape)
        #print(X_train.columns,Y_train.columns)
        print('Features size of X_train and training target Y_train shape is {} & {}'.format(X_train.shape,Y_train.shape))

        #Define Test dataset
        temp_test = test.drop(['Value_SP500_REAL_PRICE_MONTH'],axis=1,inplace=False)
        X_test = temp_test.iloc[0:test_size,:]
        #X_train.head()
        Y_test = test.iloc[0:test_size,:1]
        #print(Y_train.head())
        #print(X_test.shape,Y_test.shape)
        #print(X_test.columns,Y_test.columns)
        print('Features size of X_test and Test target Y_test shape is {} & {}'.format(X_test.shape,Y_test.shape))
        
    else:
        #Train dataset
        X_train = train['Value_SP500_REAL_PRICE_MONTH'][0:train_size]
        Y_train = train['Value_SP500_REAL_PRICE_MONTH'][0:test_size]
        print('Features size of X_train and training target Y_train shape is {} & {}'.format(X_train.shape,Y_train.shape))
            
        #Test dataset
        X_test = test['Value_SP500_REAL_PRICE_MONTH'][0:train_size]
        Y_test = test['Value_SP500_REAL_PRICE_MONTH'][0:test_size]
        print('Features size of X_test and Test target Y_test shape is {} & {}'.format(X_test.shape,Y_test.shape))
        

    return X_train, Y_train, X_test, Y_test
    
def Convert_dataset_nparray(X_train, Y_train, X_test, Y_test):
    '''
    This function would convert the training and test dataset to np.array.
    In- X_train, Y_train, X_test, Y_test
    
    OUT- np.array of X_train, Y_train, X_test, Y_test
    
    '''
    #Convert to np array as required for LSTM model.
    X_train = np.array(X_train)
    Y_train = np.array(Y_train)
    #print(X_train.shape,Y_train.shape)
    print('Training dataset is converted to np.array with size {} & {}'.format(X_train.shape,Y_train.shape))

    #Convert test dataset to np.array.
    X_test = np.array(X_test)
    Y_test = np.array(Y_test)
    #print(X_test.shape,Y_test.shape)
    print('Test dataset is converted to np.array with size {} & {}'.format(X_test.shape,Y_test.shape))
    
    return X_train, Y_train, X_test, Y_test

Define and Fit LSTM model in keras

Multivariate time-series prediction
In [56]:
from subprocess import check_output
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.layers import LSTM, CuDNNLSTM , BatchNormalization
import tensorflow as tf
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint


import time
from numpy import newaxis
Using TensorFlow backend.
In [57]:
# #Build Model
# model = Sequential()

# model.add(LSTM(input_dim=1,output_dim=50,return_sequences=True))
# model.add(Dropout(0.2))

# model.add(LSTM(100,return_sequences=False))
# model.add(Dropout(0.2))

# model.add(Dense(output_dim=1))
# model.add(Activation('linear'))

# start = time.time()
# model.compile(loss='mse', optimizer='rmsprop')
# print ('compilation time : ', time.time() - start)
In [58]:
# #Build the model
# model = Sequential()
# model.add(LSTM(256,input_shape=(2837,34)))
# model.add(Dense(1))
# model.compile(optimizer='adam',loss='mse')
# #Reshape data for (Sample,Timestep,Features) 
# #X_train = X_train.reshape((X_train.shape[0],X_train.shape[1],1))
# #X_test = X_test.reshape((X_test.shape[0],X_test.shape[1],1))
# #Fit model with history to check for overfitting
# #history = model.fit(X_train,y_train,epochs=300,validation_data=(X_test,y_test),shuffle=False)
In [59]:
#model.fit(X_train,Y_train,batch_size=128,epochs=10,validation_split=0.05)
In [60]:
# model = Sequential([
#     Dense(32, input_shape=(2837,34)),
#     Activation('relu'),
#     Dense(10),
#     Activation('softmax'),
# ])

# #Compile
# # For a mean squared error regression problem
# model.compile(optimizer='rmsprop',
#               loss='mse')
In [61]:
df_interpolate.columns
Out[61]:
Index(['Value_SP500_REAL_PRICE_MONTH', 'Value_SP500_DIV_YIELD_MONTH',
       'Value_SP500_PE_RATIO_MONTH', 'Value_SHILLER_PE_RATIO_MONTH',
       'Value_SP500_EARNINGS_YIELD_MONTH', 'Value_SP500_INFLADJ_MONTH',
       'Value_SP500_PSR_QUARTER', 'Value_SP500_DIV_MONTH',
       'Value_SP500_DIV_YEAR', 'Value_SP500_DIV_GROWTH_YEAR',
       'Value_SP500_DIV_GROWTH_QUARTER', 'Value_SP500_PBV_RATIO_QUARTER',
       'Value_SHILLER_PE_RATIO_YEAR', 'Value_SP500_PE_RATIO_YEAR',
       'Value_SP500_DIV_YIELD_YEAR', 'Value_SP500_PSR_YEAR',
       'Value_SP500_EARNINGS_YIELD_YEAR', 'Value_SP500_PBV_RATIO_YEAR',
       'Value_SP500_INFLADJ_YEAR', 'Value_SP500_SALES_YEAR',
       'Value_SP500_SALES_GROWTH_YEAR', 'Value_SP500_SALES_QUARTER',
       'Value_SP500_REAL_SALES_GROWTH_QUARTER',
       'Value_SP500_SALES_GROWTH_QUARTER',
       'Value_SP500_REAL_SALES_GROWTH_YEAR',
       'Value_SP500_REAL_EARNINGS_GROWTH_YEAR', 'Value_SP500_REAL_SALES_YEAR',
       'Value_SP500_REAL_EARNINGS_GROWTH_QUARTER',
       'Value_SP500_EARNINGS_GROWTH_QUARTER', 'Value_SP500_REAL_SALES_QUARTER',
       'Value_SP500_EARNINGS_MONTH', 'Value_SP500_BVPS_YEAR',
       'Value_SP500_EARNINGS_YEAR', 'Value_SP500_EARNINGS_GROWTH_YEAR',
       'Value_SP500_BVPS_QUARTER'],
      dtype='object')
In [62]:
#Not using PCA features here because the model resulted in huge mse errors. I think monthly features should be used which could 
#predict the real price correctly.
df_PCA_features = df_interpolate.loc[:,['Value_SP500_REAL_PRICE_MONTH',
                                        'Value_SP500_DIV_YIELD_MONTH',
                                        'Value_SP500_PE_RATIO_MONTH',
                                        'Value_SHILLER_PE_RATIO_MONTH',
                                        'Value_SP500_EARNINGS_YIELD_MONTH',
                                        'Value_SP500_INFLADJ_MONTH',
                                        'Value_SP500_EARNINGS_MONTH','Value_SP500_PSR_QUARTER','Value_SP500_SALES_QUARTER',
                                       'Value_SP500_REAL_SALES_GROWTH_QUARTER','Value_SP500_REAL_EARNINGS_GROWTH_QUARTER']]

df_PCA_features.shape
Out[62]:
(3548, 11)
In [63]:
#apply scaling
scaler_df_imputed = StandardScaler()
scaled_features = scaler_df_imputed.fit_transform(df_PCA_features.values)
df_scaled_features_interpol = pd.DataFrame(scaled_features,index=df_PCA_features.index,columns = df_PCA_features.columns)
#df_scaled_features.describe()
In [64]:
#Use real data
#X_train, Y_train, X_test, Y_test = Create_Training_Test_Dataset(df=df_PCA_features, split_percent=0.95, Linear_regr='False')
#X_train, Y_train, X_test, Y_test = Convert_dataset_nparray(X_train, Y_train, X_test, Y_test)
In [65]:
#Split master dataframe into training and test datasets
X_train, Y_train, X_test, Y_test = Create_Training_Test_Dataset(df=df_PCA_features, split_percent=0.98, Linear_regr='False')

X_train, Y_train, X_test, Y_test = Convert_dataset_nparray(X_train, Y_train, X_test, Y_test)
Training and Test dataset is of size 3477 & 71
Features size of X_train and training target Y_train shape is (3477, 10) & (3477, 1)
Features size of X_test and Test target Y_test shape is (71, 10) & (71, 1)
Training dataset is converted to np.array with size (3477, 10) & (3477, 1)
Test dataset is converted to np.array with size (71, 10) & (71, 1)
In [66]:
#Shape training data.
#the inputs (X) are reshaped into the 3D format expected by LSTM, namely [samples, timesteps, features]
X_train = X_train.reshape((X_train.shape[0],1,10))
#Y_train = Y_train.reshape((Y_train.shape[0],1,1)) #Do not reshape it. 
print(X_train.shape,Y_train.shape)
print(X_train.shape[1],X_train.shape[2])

#Shape test dataset correctly for LSTM predict.
X_test = X_test.reshape((X_test.shape[0],1,10))
#Y_test = Y_test.reshape((Y_test.shape[0],Y_test.shape[1],1))
print(X_test.shape,Y_test.shape)
print('X_test input shape : ', X_test.shape[0],X_test.shape[1],X_test.shape[2])
(3477, 1, 10) (3477, 1)
1 10
(71, 1, 10) (71, 1)
X_test input shape :  71 1 10
In [67]:
# #Design network for mean squared error regression problem
# model = Sequential()
# model.add(LSTM(512, input_shape=(X_train.shape[1], X_train.shape[2])))
# model.add(Dropout(0.2))
# Activation('relu')
# model.add(Dense(1))
# Activation('softmax')

# # Compiling the model using mean square error loss, and Adam optimizer.
# model.compile(loss='mse', optimizer='adam',metrics=['accuracy'])
# #model.compile(loss='mse',optimizer='rmsprop',metrics=['accuracy'])
In [68]:
# max_features = 1024

# model = Sequential()
# model.add(Embedding(max_features, output_dim=256))
# model.add(LSTM(128))
# model.add(Dropout(0.5))
# model.add(Dense(1, activation='sigmoid'))

# model.compile(loss='binary_crossentropy',
#               optimizer='rmsprop',
#               metrics=['accuracy'])
In [69]:
#Design network for mean squared error regression problem
model = Sequential()
#The first dimension is supposed to be each sample.input should be (n_samples, timesteps, n_features)
model.add(LSTM(512, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(BatchNormalization())

# model.add(LSTM(512, input_shape=(X_train.shape[0],X_train.shape[2]),return_sequences=True))
# model.add(Dropout(0.1))
# model.add(BatchNormalization())

Activation('relu')
model.add(Dense(1))

Activation('softmax')

# Compiling the model using mean square error loss, and Adam optimizer.
model.compile(loss='mse', optimizer='adam',metrics=['accuracy'])
#model.compile(loss='mse',optimizer='rmsprop',metrics=['accuracy'])
filepath = "RNN_final-{epoch:02d}-{val_acc:.3f}"#unique file name which will include epochs and validation accuracy score.
checkpoint = ModelCheckpoint("models/{}.model".format(filepath,monitor = 'val_acc',verbose = 1))

print(model.summary)
<bound method Network.summary of <keras.engine.sequential.Sequential object at 0x0000023B40B28C88>>
In [70]:
# fit network with epochs
history = model.fit(X_train, Y_train, epochs=200, batch_size=100, validation_data=(X_train, Y_train), verbose=2,
                    callbacks = [checkpoint],
                    shuffle=False)

model.save('LSTM_model',overwrite=True,include_optimizer=True)
Train on 3477 samples, validate on 3477 samples
Epoch 1/200
 - 5s - loss: 244238.9797 - acc: 0.0000e+00 - val_loss: 242630.7920 - val_acc: 0.0000e+00
Epoch 2/200
 - 2s - loss: 243637.9757 - acc: 0.0000e+00 - val_loss: 242623.0742 - val_acc: 0.0000e+00
Epoch 3/200
 - 2s - loss: 242933.8914 - acc: 2.8760e-04 - val_loss: 242195.8168 - val_acc: 0.0000e+00
Epoch 4/200
 - 2s - loss: 242000.9474 - acc: 2.8760e-04 - val_loss: 241721.6815 - val_acc: 0.0000e+00
Epoch 5/200
 - 2s - loss: 240848.0547 - acc: 0.0000e+00 - val_loss: 240140.1107 - val_acc: 2.8760e-04
Epoch 6/200
 - 2s - loss: 239475.0473 - acc: 0.0000e+00 - val_loss: 240018.5771 - val_acc: 0.0000e+00
Epoch 7/200
 - 2s - loss: 237921.6260 - acc: 0.0000e+00 - val_loss: 239362.4484 - val_acc: 2.8760e-04
Epoch 8/200
 - 2s - loss: 236189.1215 - acc: 2.8760e-04 - val_loss: 238873.2472 - val_acc: 0.0000e+00
Epoch 9/200
 - 2s - loss: 234283.8113 - acc: 0.0000e+00 - val_loss: 236238.2314 - val_acc: 0.0000e+00
Epoch 10/200
 - 2s - loss: 232263.7305 - acc: 0.0000e+00 - val_loss: 234679.4462 - val_acc: 2.8760e-04
Epoch 11/200
 - 2s - loss: 230159.0793 - acc: 0.0000e+00 - val_loss: 231780.6211 - val_acc: 0.0000e+00
Epoch 12/200
 - 2s - loss: 227963.8670 - acc: 0.0000e+00 - val_loss: 228800.3780 - val_acc: 0.0000e+00
Epoch 13/200
 - 2s - loss: 225788.3079 - acc: 0.0000e+00 - val_loss: 226632.0599 - val_acc: 0.0000e+00
Epoch 14/200
 - 3s - loss: 223606.1505 - acc: 0.0000e+00 - val_loss: 223809.9382 - val_acc: 0.0000e+00
Epoch 15/200
 - 3s - loss: 221416.7447 - acc: 0.0000e+00 - val_loss: 220388.9384 - val_acc: 0.0000e+00
Epoch 16/200
 - 3s - loss: 219306.1292 - acc: 0.0000e+00 - val_loss: 216452.5520 - val_acc: 0.0000e+00
Epoch 17/200
 - 3s - loss: 217239.6921 - acc: 0.0000e+00 - val_loss: 213895.7212 - val_acc: 0.0000e+00
Epoch 18/200
 - 3s - loss: 215266.2501 - acc: 0.0000e+00 - val_loss: 211874.5399 - val_acc: 0.0000e+00
Epoch 19/200
 - 3s - loss: 213426.6784 - acc: 0.0000e+00 - val_loss: 209590.2366 - val_acc: 2.8760e-04
Epoch 20/200
 - 3s - loss: 211654.5152 - acc: 0.0000e+00 - val_loss: 206945.6682 - val_acc: 0.0000e+00
Epoch 21/200
 - 3s - loss: 210006.4334 - acc: 0.0000e+00 - val_loss: 204139.0336 - val_acc: 0.0000e+00
Epoch 22/200
 - 2s - loss: 208480.0654 - acc: 0.0000e+00 - val_loss: 202818.2083 - val_acc: 0.0000e+00
Epoch 23/200
 - 3s - loss: 207095.6864 - acc: 0.0000e+00 - val_loss: 200138.3704 - val_acc: 0.0000e+00
Epoch 24/200
 - 3s - loss: 205760.5758 - acc: 2.8760e-04 - val_loss: 198417.4672 - val_acc: 0.0000e+00
Epoch 25/200
 - 2s - loss: 204575.2013 - acc: 0.0000e+00 - val_loss: 196761.1546 - val_acc: 0.0000e+00
Epoch 26/200
 - 2s - loss: 203502.4613 - acc: 0.0000e+00 - val_loss: 195262.1544 - val_acc: 0.0000e+00
Epoch 27/200
 - 2s - loss: 202528.4124 - acc: 0.0000e+00 - val_loss: 194745.8543 - val_acc: 0.0000e+00
Epoch 28/200
 - 3s - loss: 201706.8706 - acc: 0.0000e+00 - val_loss: 193888.8870 - val_acc: 0.0000e+00
Epoch 29/200
 - 3s - loss: 200873.6037 - acc: 0.0000e+00 - val_loss: 193091.4885 - val_acc: 0.0000e+00
Epoch 30/200
 - 3s - loss: 200161.1630 - acc: 0.0000e+00 - val_loss: 193206.9390 - val_acc: 0.0000e+00
Epoch 31/200
 - 3s - loss: 199572.3630 - acc: 0.0000e+00 - val_loss: 192240.0993 - val_acc: 0.0000e+00
Epoch 32/200
 - 3s - loss: 199035.0737 - acc: 0.0000e+00 - val_loss: 191247.9004 - val_acc: 0.0000e+00
Epoch 33/200
 - 2s - loss: 198521.9556 - acc: 0.0000e+00 - val_loss: 190547.4664 - val_acc: 0.0000e+00
Epoch 34/200
 - 2s - loss: 198099.8821 - acc: 0.0000e+00 - val_loss: 189714.8254 - val_acc: 0.0000e+00
Epoch 35/200
 - 3s - loss: 197701.3073 - acc: 0.0000e+00 - val_loss: 188941.1377 - val_acc: 0.0000e+00
Epoch 36/200
 - 3s - loss: 197376.9153 - acc: 0.0000e+00 - val_loss: 188377.7941 - val_acc: 2.8760e-04
Epoch 37/200
 - 3s - loss: 197044.9920 - acc: 0.0000e+00 - val_loss: 187045.7611 - val_acc: 0.0000e+00
Epoch 38/200
 - 2s - loss: 196795.9062 - acc: 0.0000e+00 - val_loss: 187351.3920 - val_acc: 0.0000e+00
Epoch 39/200
 - 2s - loss: 196572.3703 - acc: 0.0000e+00 - val_loss: 187124.7167 - val_acc: 0.0000e+00
Epoch 40/200
 - 3s - loss: 196370.9548 - acc: 0.0000e+00 - val_loss: 186639.7503 - val_acc: 0.0000e+00
Epoch 41/200
 - 3s - loss: 196227.8874 - acc: 0.0000e+00 - val_loss: 186408.3390 - val_acc: 0.0000e+00
Epoch 42/200
 - 2s - loss: 196061.2889 - acc: 0.0000e+00 - val_loss: 186077.8087 - val_acc: 0.0000e+00
Epoch 43/200
 - 2s - loss: 195928.0746 - acc: 0.0000e+00 - val_loss: 185542.2910 - val_acc: 0.0000e+00
Epoch 44/200
 - 3s - loss: 195864.5363 - acc: 0.0000e+00 - val_loss: 185105.1968 - val_acc: 0.0000e+00
Epoch 45/200
 - 3s - loss: 195753.8908 - acc: 0.0000e+00 - val_loss: 184844.2340 - val_acc: 0.0000e+00
Epoch 46/200
 - 3s - loss: 195698.9140 - acc: 0.0000e+00 - val_loss: 184650.5630 - val_acc: 0.0000e+00
Epoch 47/200
 - 3s - loss: 195616.3337 - acc: 0.0000e+00 - val_loss: 184245.7706 - val_acc: 0.0000e+00
Epoch 48/200
 - 3s - loss: 195569.5281 - acc: 0.0000e+00 - val_loss: 184300.0953 - val_acc: 0.0000e+00
Epoch 49/200
 - 3s - loss: 195521.6131 - acc: 0.0000e+00 - val_loss: 184786.1297 - val_acc: 0.0000e+00
Epoch 50/200
 - 3s - loss: 195468.6009 - acc: 0.0000e+00 - val_loss: 185139.8302 - val_acc: 0.0000e+00
Epoch 51/200
 - 3s - loss: 195454.3358 - acc: 0.0000e+00 - val_loss: 185025.3426 - val_acc: 0.0000e+00
Epoch 52/200
 - 3s - loss: 195445.4346 - acc: 0.0000e+00 - val_loss: 184578.5638 - val_acc: 0.0000e+00
Epoch 53/200
 - 3s - loss: 195396.9004 - acc: 0.0000e+00 - val_loss: 184158.2218 - val_acc: 0.0000e+00
Epoch 54/200
 - 3s - loss: 195407.0587 - acc: 0.0000e+00 - val_loss: 184341.6772 - val_acc: 0.0000e+00
Epoch 55/200
 - 3s - loss: 195399.8430 - acc: 0.0000e+00 - val_loss: 184529.7913 - val_acc: 0.0000e+00
Epoch 56/200
 - 3s - loss: 195348.8731 - acc: 0.0000e+00 - val_loss: 184536.9332 - val_acc: 0.0000e+00
Epoch 57/200
 - 3s - loss: 195358.3157 - acc: 0.0000e+00 - val_loss: 184368.8823 - val_acc: 0.0000e+00
Epoch 58/200
 - 3s - loss: 195324.2318 - acc: 0.0000e+00 - val_loss: 183566.8680 - val_acc: 0.0000e+00
Epoch 59/200
 - 3s - loss: 195324.9126 - acc: 0.0000e+00 - val_loss: 183373.8943 - val_acc: 0.0000e+00
Epoch 60/200
 - 3s - loss: 195321.1821 - acc: 0.0000e+00 - val_loss: 183972.8905 - val_acc: 0.0000e+00
Epoch 61/200
 - 3s - loss: 195354.8061 - acc: 0.0000e+00 - val_loss: 184067.9746 - val_acc: 0.0000e+00
Epoch 62/200
 - 3s - loss: 195325.6220 - acc: 0.0000e+00 - val_loss: 183365.7582 - val_acc: 0.0000e+00
Epoch 63/200
 - 2s - loss: 195325.1537 - acc: 0.0000e+00 - val_loss: 183448.1593 - val_acc: 0.0000e+00
Epoch 64/200
 - 2s - loss: 195361.3605 - acc: 0.0000e+00 - val_loss: 184158.1261 - val_acc: 0.0000e+00
Epoch 65/200
 - 2s - loss: 195327.4322 - acc: 0.0000e+00 - val_loss: 184253.0312 - val_acc: 0.0000e+00
Epoch 66/200
 - 2s - loss: 195378.1483 - acc: 0.0000e+00 - val_loss: 184558.3813 - val_acc: 0.0000e+00
Epoch 67/200
 - 2s - loss: 195374.7326 - acc: 0.0000e+00 - val_loss: 183383.6095 - val_acc: 0.0000e+00
Epoch 68/200
 - 2s - loss: 195368.7838 - acc: 0.0000e+00 - val_loss: 183544.9586 - val_acc: 0.0000e+00
Epoch 69/200
 - 3s - loss: 195334.8143 - acc: 0.0000e+00 - val_loss: 184147.1688 - val_acc: 0.0000e+00
Epoch 70/200
 - 3s - loss: 195389.6477 - acc: 0.0000e+00 - val_loss: 184665.8255 - val_acc: 0.0000e+00
Epoch 71/200
 - 2s - loss: 195363.5043 - acc: 0.0000e+00 - val_loss: 184763.5030 - val_acc: 0.0000e+00
Epoch 72/200
 - 2s - loss: 195373.3171 - acc: 0.0000e+00 - val_loss: 184667.6454 - val_acc: 0.0000e+00
Epoch 73/200
 - 2s - loss: 195379.7082 - acc: 0.0000e+00 - val_loss: 184943.3379 - val_acc: 0.0000e+00
Epoch 74/200
 - 2s - loss: 195392.4984 - acc: 0.0000e+00 - val_loss: 184610.1339 - val_acc: 0.0000e+00
Epoch 75/200
 - 2s - loss: 195357.4133 - acc: 0.0000e+00 - val_loss: 184848.9067 - val_acc: 0.0000e+00
Epoch 76/200
 - 2s - loss: 195429.5762 - acc: 0.0000e+00 - val_loss: 185251.1268 - val_acc: 0.0000e+00
Epoch 77/200
 - 2s - loss: 195427.6759 - acc: 0.0000e+00 - val_loss: 185617.7255 - val_acc: 0.0000e+00
Epoch 78/200
 - 2s - loss: 195412.7640 - acc: 0.0000e+00 - val_loss: 185401.8593 - val_acc: 0.0000e+00
Epoch 79/200
 - 3s - loss: 195408.7144 - acc: 0.0000e+00 - val_loss: 185220.5007 - val_acc: 0.0000e+00
Epoch 80/200
 - 3s - loss: 195395.4735 - acc: 0.0000e+00 - val_loss: 185333.0465 - val_acc: 0.0000e+00
Epoch 81/200
 - 3s - loss: 195431.5948 - acc: 0.0000e+00 - val_loss: 185281.1637 - val_acc: 0.0000e+00
Epoch 82/200
 - 3s - loss: 195427.2975 - acc: 0.0000e+00 - val_loss: 185063.8881 - val_acc: 0.0000e+00
Epoch 83/200
 - 2s - loss: 195429.2891 - acc: 0.0000e+00 - val_loss: 185142.5852 - val_acc: 0.0000e+00
Epoch 84/200
 - 2s - loss: 195459.0527 - acc: 0.0000e+00 - val_loss: 184590.9984 - val_acc: 0.0000e+00
Epoch 85/200
 - 2s - loss: 195448.4422 - acc: 0.0000e+00 - val_loss: 184182.4285 - val_acc: 0.0000e+00
Epoch 86/200
 - 2s - loss: 195424.7728 - acc: 0.0000e+00 - val_loss: 184104.9516 - val_acc: 0.0000e+00
Epoch 87/200
 - 2s - loss: 195400.6353 - acc: 0.0000e+00 - val_loss: 183871.4632 - val_acc: 0.0000e+00
Epoch 88/200
 - 3s - loss: 195432.5526 - acc: 0.0000e+00 - val_loss: 183814.1315 - val_acc: 0.0000e+00
Epoch 89/200
 - 3s - loss: 195453.0358 - acc: 2.8760e-04 - val_loss: 184666.0870 - val_acc: 0.0000e+00
Epoch 90/200
 - 2s - loss: 195464.1884 - acc: 0.0000e+00 - val_loss: 184499.2254 - val_acc: 0.0000e+00
Epoch 91/200
 - 2s - loss: 195448.0775 - acc: 0.0000e+00 - val_loss: 184666.1427 - val_acc: 0.0000e+00
Epoch 92/200
 - 2s - loss: 195447.6317 - acc: 0.0000e+00 - val_loss: 185652.8650 - val_acc: 0.0000e+00
Epoch 93/200
 - 3s - loss: 195436.8737 - acc: 0.0000e+00 - val_loss: 185552.3937 - val_acc: 0.0000e+00
Epoch 94/200
 - 3s - loss: 195437.4609 - acc: 0.0000e+00 - val_loss: 185486.7145 - val_acc: 0.0000e+00
Epoch 95/200
 - 2s - loss: 195483.3410 - acc: 0.0000e+00 - val_loss: 185325.4870 - val_acc: 0.0000e+00
Epoch 96/200
 - 2s - loss: 195458.6826 - acc: 0.0000e+00 - val_loss: 184841.7381 - val_acc: 0.0000e+00
Epoch 97/200
 - 2s - loss: 195438.1483 - acc: 0.0000e+00 - val_loss: 184822.1032 - val_acc: 0.0000e+00
Epoch 98/200
 - 3s - loss: 195474.9970 - acc: 0.0000e+00 - val_loss: 184452.8845 - val_acc: 0.0000e+00
Epoch 99/200
 - 3s - loss: 195479.1822 - acc: 0.0000e+00 - val_loss: 184494.5250 - val_acc: 0.0000e+00
Epoch 100/200
 - 2s - loss: 195469.4960 - acc: 0.0000e+00 - val_loss: 184210.1910 - val_acc: 0.0000e+00
Epoch 101/200
 - 2s - loss: 195463.7877 - acc: 0.0000e+00 - val_loss: 184259.6424 - val_acc: 0.0000e+00
Epoch 102/200
 - 2s - loss: 195489.6614 - acc: 0.0000e+00 - val_loss: 184673.6306 - val_acc: 0.0000e+00
Epoch 103/200
 - 2s - loss: 195468.4255 - acc: 0.0000e+00 - val_loss: 184837.2438 - val_acc: 0.0000e+00
Epoch 104/200
 - 2s - loss: 195466.5730 - acc: 0.0000e+00 - val_loss: 184831.9443 - val_acc: 0.0000e+00
Epoch 105/200
 - 2s - loss: 195461.7147 - acc: 0.0000e+00 - val_loss: 184811.8820 - val_acc: 0.0000e+00
Epoch 106/200
 - 2s - loss: 195495.6439 - acc: 0.0000e+00 - val_loss: 184535.9261 - val_acc: 0.0000e+00
Epoch 107/200
 - 3s - loss: 195478.5014 - acc: 0.0000e+00 - val_loss: 183850.6027 - val_acc: 0.0000e+00
Epoch 108/200
 - 3s - loss: 195485.0055 - acc: 0.0000e+00 - val_loss: 184228.0488 - val_acc: 0.0000e+00
Epoch 109/200
 - 3s - loss: 195490.1562 - acc: 0.0000e+00 - val_loss: 184049.4651 - val_acc: 0.0000e+00
Epoch 110/200
 - 2s - loss: 195463.5472 - acc: 0.0000e+00 - val_loss: 183979.9637 - val_acc: 0.0000e+00
Epoch 111/200
 - 3s - loss: 195466.2002 - acc: 0.0000e+00 - val_loss: 183922.0163 - val_acc: 0.0000e+00
Epoch 112/200
 - 2s - loss: 195494.2449 - acc: 0.0000e+00 - val_loss: 184098.9795 - val_acc: 0.0000e+00
Epoch 113/200
 - 2s - loss: 195467.4129 - acc: 2.8760e-04 - val_loss: 183923.9146 - val_acc: 0.0000e+00
Epoch 114/200
 - 3s - loss: 195478.9303 - acc: 0.0000e+00 - val_loss: 184233.8636 - val_acc: 0.0000e+00
Epoch 115/200
 - 2s - loss: 195492.8755 - acc: 0.0000e+00 - val_loss: 183678.9782 - val_acc: 0.0000e+00
Epoch 116/200
 - 3s - loss: 195476.1045 - acc: 0.0000e+00 - val_loss: 183999.5662 - val_acc: 0.0000e+00
Epoch 117/200
 - 3s - loss: 195475.0067 - acc: 0.0000e+00 - val_loss: 183750.6926 - val_acc: 0.0000e+00
Epoch 118/200
 - 2s - loss: 195444.1771 - acc: 0.0000e+00 - val_loss: 183531.4881 - val_acc: 0.0000e+00
Epoch 119/200
 - 3s - loss: 195456.0990 - acc: 0.0000e+00 - val_loss: 183617.5793 - val_acc: 0.0000e+00
Epoch 120/200
 - 2s - loss: 195466.2133 - acc: 0.0000e+00 - val_loss: 183802.5115 - val_acc: 0.0000e+00
Epoch 121/200
 - 3s - loss: 195437.4059 - acc: 0.0000e+00 - val_loss: 183594.7202 - val_acc: 0.0000e+00
Epoch 122/200
 - 3s - loss: 195479.0137 - acc: 0.0000e+00 - val_loss: 183594.9440 - val_acc: 0.0000e+00
Epoch 123/200
 - 2s - loss: 195445.1567 - acc: 0.0000e+00 - val_loss: 183830.1620 - val_acc: 0.0000e+00
Epoch 124/200
 - 2s - loss: 195485.2744 - acc: 0.0000e+00 - val_loss: 183234.6731 - val_acc: 0.0000e+00
Epoch 125/200
 - 2s - loss: 195475.8863 - acc: 0.0000e+00 - val_loss: 183765.6606 - val_acc: 0.0000e+00
Epoch 126/200
 - 2s - loss: 195461.2404 - acc: 2.8760e-04 - val_loss: 183565.6058 - val_acc: 0.0000e+00
Epoch 127/200
 - 2s - loss: 195482.5538 - acc: 0.0000e+00 - val_loss: 183641.1930 - val_acc: 0.0000e+00
Epoch 128/200
 - 2s - loss: 195472.1918 - acc: 0.0000e+00 - val_loss: 183727.4728 - val_acc: 0.0000e+00
Epoch 129/200
 - 2s - loss: 195476.9520 - acc: 0.0000e+00 - val_loss: 183403.4754 - val_acc: 0.0000e+00
Epoch 130/200
 - 2s - loss: 195470.1123 - acc: 0.0000e+00 - val_loss: 183650.5799 - val_acc: 0.0000e+00
Epoch 131/200
 - 2s - loss: 195467.0799 - acc: 0.0000e+00 - val_loss: 183422.5880 - val_acc: 0.0000e+00
Epoch 132/200
 - 2s - loss: 195477.9288 - acc: 0.0000e+00 - val_loss: 183731.2103 - val_acc: 0.0000e+00
Epoch 133/200
 - 2s - loss: 195453.9476 - acc: 0.0000e+00 - val_loss: 183664.7018 - val_acc: 0.0000e+00
Epoch 134/200
 - 2s - loss: 195459.7911 - acc: 2.8760e-04 - val_loss: 183258.9522 - val_acc: 0.0000e+00
Epoch 135/200
 - 3s - loss: 195477.6894 - acc: 0.0000e+00 - val_loss: 183250.0353 - val_acc: 0.0000e+00
Epoch 136/200
 - 2s - loss: 195470.4907 - acc: 2.8760e-04 - val_loss: 183325.1750 - val_acc: 0.0000e+00
Epoch 137/200
 - 2s - loss: 195457.6549 - acc: 0.0000e+00 - val_loss: 183128.0413 - val_acc: 0.0000e+00
Epoch 138/200
 - 2s - loss: 195451.7374 - acc: 2.8760e-04 - val_loss: 183095.3022 - val_acc: 0.0000e+00
Epoch 139/200
 - 2s - loss: 195430.9589 - acc: 0.0000e+00 - val_loss: 183068.1891 - val_acc: 0.0000e+00
Epoch 140/200
 - 2s - loss: 195472.4346 - acc: 0.0000e+00 - val_loss: 183043.8016 - val_acc: 0.0000e+00
Epoch 141/200
 - 3s - loss: 195474.6245 - acc: 0.0000e+00 - val_loss: 183072.5931 - val_acc: 0.0000e+00
Epoch 142/200
 - 2s - loss: 195439.9469 - acc: 0.0000e+00 - val_loss: 183135.6248 - val_acc: 0.0000e+00
Epoch 143/200
 - 2s - loss: 195441.8985 - acc: 0.0000e+00 - val_loss: 183021.7210 - val_acc: 0.0000e+00
Epoch 144/200
 - 2s - loss: 195442.6170 - acc: 0.0000e+00 - val_loss: 182893.0097 - val_acc: 0.0000e+00
Epoch 145/200
 - 2s - loss: 195439.8753 - acc: 0.0000e+00 - val_loss: 182915.6292 - val_acc: 0.0000e+00
Epoch 146/200
 - 2s - loss: 195436.0277 - acc: 0.0000e+00 - val_loss: 182777.1501 - val_acc: 0.0000e+00
Epoch 147/200
 - 2s - loss: 195434.7522 - acc: 0.0000e+00 - val_loss: 182753.7661 - val_acc: 0.0000e+00
Epoch 148/200
 - 2s - loss: 195457.6565 - acc: 0.0000e+00 - val_loss: 182612.4348 - val_acc: 0.0000e+00
Epoch 149/200
 - 2s - loss: 195495.7157 - acc: 0.0000e+00 - val_loss: 182550.5300 - val_acc: 0.0000e+00
Epoch 150/200
 - 2s - loss: 195449.4474 - acc: 0.0000e+00 - val_loss: 182546.5075 - val_acc: 0.0000e+00
Epoch 151/200
 - 2s - loss: 195457.4294 - acc: 0.0000e+00 - val_loss: 181868.5405 - val_acc: 0.0000e+00
Epoch 152/200
 - 2s - loss: 195460.9698 - acc: 2.8760e-04 - val_loss: 182062.7935 - val_acc: 0.0000e+00
Epoch 153/200
 - 2s - loss: 195457.5152 - acc: 0.0000e+00 - val_loss: 181863.8373 - val_acc: 0.0000e+00
Epoch 154/200
 - 2s - loss: 195466.1971 - acc: 0.0000e+00 - val_loss: 182173.9323 - val_acc: 0.0000e+00
Epoch 155/200
 - 2s - loss: 195444.8498 - acc: 0.0000e+00 - val_loss: 181869.6481 - val_acc: 0.0000e+00
Epoch 156/200
 - 2s - loss: 195453.1903 - acc: 0.0000e+00 - val_loss: 181605.8907 - val_acc: 0.0000e+00
Epoch 157/200
 - 2s - loss: 195489.6974 - acc: 0.0000e+00 - val_loss: 182259.6924 - val_acc: 0.0000e+00
Epoch 158/200
 - 2s - loss: 195466.0687 - acc: 0.0000e+00 - val_loss: 182189.6657 - val_acc: 0.0000e+00
Epoch 159/200
 - 2s - loss: 195485.9728 - acc: 0.0000e+00 - val_loss: 182403.9244 - val_acc: 0.0000e+00
Epoch 160/200
 - 3s - loss: 195506.8130 - acc: 0.0000e+00 - val_loss: 182244.1800 - val_acc: 0.0000e+00
Epoch 161/200
 - 3s - loss: 195480.0568 - acc: 0.0000e+00 - val_loss: 182473.5691 - val_acc: 0.0000e+00
Epoch 162/200
 - 2s - loss: 195496.4917 - acc: 0.0000e+00 - val_loss: 182636.2321 - val_acc: 0.0000e+00
Epoch 163/200
 - 2s - loss: 195471.3256 - acc: 0.0000e+00 - val_loss: 182680.6537 - val_acc: 2.8760e-04
Epoch 164/200
 - 2s - loss: 195473.1113 - acc: 0.0000e+00 - val_loss: 182696.0740 - val_acc: 0.0000e+00
Epoch 165/200
 - 2s - loss: 195477.8964 - acc: 0.0000e+00 - val_loss: 182575.0914 - val_acc: 2.8760e-04
Epoch 166/200
 - 2s - loss: 195487.1710 - acc: 0.0000e+00 - val_loss: 182742.1346 - val_acc: 0.0000e+00
Epoch 167/200
 - 2s - loss: 195460.1076 - acc: 0.0000e+00 - val_loss: 182596.5257 - val_acc: 0.0000e+00
Epoch 168/200
 - 3s - loss: 195465.9434 - acc: 0.0000e+00 - val_loss: 182656.0113 - val_acc: 0.0000e+00
Epoch 169/200
 - 2s - loss: 195463.6720 - acc: 2.8760e-04 - val_loss: 181938.2320 - val_acc: 0.0000e+00
Epoch 170/200
 - 2s - loss: 195480.6142 - acc: 0.0000e+00 - val_loss: 181441.6512 - val_acc: 0.0000e+00
Epoch 171/200
 - 2s - loss: 195475.3307 - acc: 0.0000e+00 - val_loss: 181792.5836 - val_acc: 0.0000e+00
Epoch 172/200
 - 2s - loss: 195456.1476 - acc: 0.0000e+00 - val_loss: 182147.3755 - val_acc: 0.0000e+00
Epoch 173/200
 - 2s - loss: 195475.8040 - acc: 0.0000e+00 - val_loss: 182177.5330 - val_acc: 0.0000e+00
Epoch 174/200
 - 2s - loss: 195479.9504 - acc: 0.0000e+00 - val_loss: 182208.6537 - val_acc: 0.0000e+00
Epoch 175/200
 - 2s - loss: 195459.6837 - acc: 0.0000e+00 - val_loss: 182122.0784 - val_acc: 0.0000e+00
Epoch 176/200
 - 2s - loss: 195467.3283 - acc: 2.8760e-04 - val_loss: 181848.8619 - val_acc: 0.0000e+00
Epoch 177/200
 - 2s - loss: 195468.4745 - acc: 0.0000e+00 - val_loss: 181862.0922 - val_acc: 0.0000e+00
Epoch 178/200
 - 2s - loss: 195471.8972 - acc: 0.0000e+00 - val_loss: 181886.4019 - val_acc: 0.0000e+00
Epoch 179/200
 - 2s - loss: 195460.1161 - acc: 0.0000e+00 - val_loss: 181814.8222 - val_acc: 0.0000e+00
Epoch 180/200
 - 2s - loss: 195499.5465 - acc: 0.0000e+00 - val_loss: 181819.5692 - val_acc: 0.0000e+00
Epoch 181/200
 - 2s - loss: 195454.7540 - acc: 0.0000e+00 - val_loss: 181887.8555 - val_acc: 0.0000e+00
Epoch 182/200
 - 2s - loss: 195487.3822 - acc: 0.0000e+00 - val_loss: 181821.7993 - val_acc: 0.0000e+00
Epoch 183/200
 - 2s - loss: 195455.0825 - acc: 0.0000e+00 - val_loss: 181856.4182 - val_acc: 0.0000e+00
Epoch 184/200
 - 2s - loss: 195496.6770 - acc: 0.0000e+00 - val_loss: 181768.0757 - val_acc: 0.0000e+00
Epoch 185/200
 - 2s - loss: 195461.5927 - acc: 0.0000e+00 - val_loss: 181683.1072 - val_acc: 0.0000e+00
Epoch 186/200
 - 2s - loss: 195495.0217 - acc: 0.0000e+00 - val_loss: 181700.0158 - val_acc: 0.0000e+00
Epoch 187/200
 - 2s - loss: 195445.7431 - acc: 0.0000e+00 - val_loss: 181766.8667 - val_acc: 0.0000e+00
Epoch 188/200
 - 2s - loss: 195463.6909 - acc: 0.0000e+00 - val_loss: 181982.3083 - val_acc: 0.0000e+00
Epoch 189/200
 - 2s - loss: 195477.2432 - acc: 0.0000e+00 - val_loss: 181940.3596 - val_acc: 0.0000e+00
Epoch 190/200
 - 2s - loss: 195464.1343 - acc: 0.0000e+00 - val_loss: 181823.8806 - val_acc: 0.0000e+00
Epoch 191/200
 - 2s - loss: 195452.2148 - acc: 0.0000e+00 - val_loss: 181949.4636 - val_acc: 0.0000e+00
Epoch 192/200
 - 2s - loss: 195469.9830 - acc: 0.0000e+00 - val_loss: 181792.7081 - val_acc: 0.0000e+00
Epoch 193/200
 - 2s - loss: 195459.5354 - acc: 0.0000e+00 - val_loss: 181454.7131 - val_acc: 0.0000e+00
Epoch 194/200
 - 3s - loss: 195448.5514 - acc: 0.0000e+00 - val_loss: 181652.3696 - val_acc: 0.0000e+00
Epoch 195/200
 - 2s - loss: 195465.3238 - acc: 0.0000e+00 - val_loss: 181647.2812 - val_acc: 0.0000e+00
Epoch 196/200
 - 2s - loss: 195464.8206 - acc: 0.0000e+00 - val_loss: 181621.8350 - val_acc: 0.0000e+00
Epoch 197/200
 - 2s - loss: 195439.5109 - acc: 0.0000e+00 - val_loss: 181538.7109 - val_acc: 0.0000e+00
Epoch 198/200
 - 2s - loss: 195468.4378 - acc: 0.0000e+00 - val_loss: 181387.1260 - val_acc: 0.0000e+00
Epoch 199/200
 - 2s - loss: 195460.6704 - acc: 0.0000e+00 - val_loss: 181443.1030 - val_acc: 0.0000e+00
Epoch 200/200
 - 2s - loss: 195466.0657 - acc: 0.0000e+00 - val_loss: 181374.3545 - val_acc: 0.0000e+00
In [71]:
# plot history
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='test val_loss')
plt.legend()
plt.show()
In [72]:
y_pred_lstm_imputed = model.predict(X_test)
plt.plot(Y_test.reshape(-1,1))
plt.plot(y_pred_lstm_imputed)
Out[72]:
[<matplotlib.lines.Line2D at 0x23b452ded30>]
In [73]:
plt.plot(Y_test.reshape(-1,1))
plt.plot(y_pred_lstm_imputed)

plt.title('LSTM predicted vs actual values')
#plt.xlabel('Actual Price')
#plt.ylabel('Predicted Price')
#plt.legend(Y_test,y_pred_lstm_imputed)
plt.show()
In [74]:
print(Y_test.shape,X_test.shape,y_pred_lstm_imputed.shape)
(71, 1) (71, 1, 10) (71, 1)

Check and updated shape of X_test. Scale X_train and x_test on correct shape of dataframe.

In [75]:
#Inverse transform the prediction and Y_test
X_test = X_test.reshape((X_test.shape[0],X_test.shape[2]))

# Concatenate with X_test to reshape before inverse transform
y_pred_lstm_imputed = concatenate((y_pred_lstm_imputed,X_test),axis=1)
print(X_test.shape,y_pred_lstm_imputed.shape,Y_test.shape)

#Invert scaling for forecast
y_pred_lstm_imputed = scaler_df_imputed.inverse_transform(y_pred_lstm_imputed)
print(X_test.shape,y_pred_lstm_imputed.shape,Y_test.shape)

#slice reverse transformed y_pred values to compare with Y_test actual values
y_pred_lstm_imputed = y_pred_lstm_imputed[:,0:1]
(71, 10) (71, 11) (71, 1)
(71, 10) (71, 11) (71, 1)
In [76]:
#Reconsturct Y_test with X_test and inverse transform it.
Y_test = concatenate((Y_test,X_test),axis=1)
Y_test = scaler_df_imputed.inverse_transform(Y_test)
#slice reverse transformed y_pred values to compare with Y_test actual values
Y_test = Y_test[:,0:1]
In [77]:
print(X_test.shape,y_pred_lstm_imputed.shape,Y_test.shape)
(71, 10) (71, 1) (71, 1)

Evaluate Model LSTM model

In [78]:
#Calculate MSE between actual values Y_test and inverse transformed y_pred
mse = mean_squared_error(Y_test,y_pred_lstm_imputed)
print('Means Square Error between Y_test adn prediction values is :', mse)
Means Square Error between Y_test adn prediction values is : 1307257985747.0803
In [79]:
print('predicted- {} ; Actual - {}'.format(y_pred_lstm_imputed[-1], Y_test[-1]))
predicted- [143851.07772796] ; Actual - [1460579.96104185]
In [80]:
r2score = r2_score(Y_test,y_pred_lstm_imputed)
print('R2 score between Y_test adn prediction value is :', r2score)
R2 score between Y_test adn prediction value is : -60.31341629150638
In [81]:
model.summary()
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
lstm_1 (LSTM)                (None, 512)               1071104   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 512)               2048      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 513       
=================================================================
Total params: 1,073,665
Trainable params: 1,072,641
Non-trainable params: 1,024
_________________________________________________________________

Apply Linear Regression model

In [82]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
In [83]:
#Convert Date index to column
#df_imputed.reset_index(inplace=True)

Plotting the Least Squares Line

Best observed high variance features Vs target label 'Value_SP500_REAL_PRICE_MONTH' to understand linear corelation.

This is to find colinearity between features and the prediction line for selected set of features. Predicted line would try to fit with linear regression model through the feature data points.

The dataset here is very imbalanced and non linear which makes it different to predict a linear line with optimum coefficients and intercept.
In [84]:
 #5 features with highest variance.
max_var_cols1 = Print_PCAfeatures_graph(df_imputed,a,-10)

# visualize relationship between the features and the target price using scatterplots
sns.pairplot(df_imputed, x_vars=max_var_cols1,y_vars=['Value_SP500_REAL_PRICE_MONTH'], height=10, aspect=0.5,kind='reg')
<Figure size 3000x2000 with 0 Axes>
Out[84]:
<seaborn.axisgrid.PairGrid at 0x23b41f41390>
In [85]:
 #5 feature with highest PCA variance.
max_var_cols2 = Print_PCAfeatures_graph(df_interpolate,b,-10)
print(max_var_cols2)

# visualize relationship between the features and the target price using seaborn scatterplot. 
#High variance features Vs target label
sns.pairplot(df_interpolate, x_vars=max_var_cols2,y_vars=['Value_SP500_REAL_PRICE_MONTH'], height=10, aspect=0.5,kind='reg')
<Figure size 3000x2000 with 0 Axes>
['Value_SP500_EARNINGS_YIELD_MONTH', 'Value_SP500_PBV_RATIO_YEAR', 'Value_SP500_PE_RATIO_MONTH', 'Value_SP500_PBV_RATIO_QUARTER', 'Value_SP500_REAL_EARNINGS_GROWTH_QUARTER', 'Value_SP500_REAL_SALES_YEAR', 'Value_SHILLER_PE_RATIO_YEAR', 'Value_SP500_EARNINGS_YEAR', 'Value_SP500_DIV_YIELD_MONTH', 'Value_SP500_REAL_SALES_GROWTH_YEAR']
Out[85]:
<seaborn.axisgrid.PairGrid at 0x23b48418c50>
In [86]:
 #5 feature with highest PCA variance.
max_var_cols3 = Print_PCAfeatures_graph(df_interpolate,b,10)
print(max_var_cols3)

# visualize relationship between the features and the target price using seaborn scatterplot. 
#High variance features Vs target label
sns.pairplot(df_interpolate, x_vars=max_var_cols3,y_vars=['Value_SP500_REAL_PRICE_MONTH'], height=10, aspect=0.5,kind='reg')
<Figure size 3000x2000 with 0 Axes>
['Value_SP500_DIV_GROWTH_YEAR', 'Value_SP500_DIV_YEAR', 'Value_SP500_REAL_SALES_GROWTH_QUARTER', 'Value_SP500_SALES_YEAR', 'Value_SP500_SALES_QUARTER', 'Value_SP500_SALES_GROWTH_QUARTER', 'Value_SP500_EARNINGS_GROWTH_QUARTER', 'Value_SP500_REAL_EARNINGS_GROWTH_YEAR', 'Value_SHILLER_PE_RATIO_MONTH', 'Value_SP500_PSR_YEAR']
Out[86]:
<seaborn.axisgrid.PairGrid at 0x23b4d9d5780>

Split into training and test dataset for Linear Regression

In [87]:
#Split data into train and test
#X_train, Y_train, X_test, Y_test = Create_Training_Test_Dataset(df_imputed,0.8,Linear_regr='True')
In [88]:
scaler = StandardScaler()
scaled_interpolated = scaler.fit_transform(df_PCA_features.values)
df_scaled_interpolated = pd.DataFrame(scaled_interpolated,index=df_PCA_features.index,columns = df_PCA_features.columns)
In [89]:
# Split the size into 80% and 20% row-wise
split_percent = 0.8
train_size = int(len(df_PCA_features) * split_percent)
test_size = len(df_PCA_features) - train_size
#print(train_size,test_size)
print('Training and Test dataset is of size {} & {}'.format(train_size,test_size))

#Slice the df into train and test df.
train = df_PCA_features.iloc[0:train_size,:]
test = df_PCA_features.iloc[train_size:len(df),:]
print(train.shape, test.shape)
Training and Test dataset is of size 2838 & 710
(2838, 11) (710, 11)
In [90]:
# #Slice the df into train and test df.
# train = df_scaled_interpolated.iloc[0:train_size,:]
# test = df_scaled_interpolated.iloc[train_size:len(df),:]
# print(train.shape, test.shape)
In [91]:
# Slice Train dataset.
temp_train = train.drop(['Value_SP500_REAL_PRICE_MONTH'],axis=1,inplace=False)
X_train = train.drop(['Value_SP500_REAL_PRICE_MONTH'],axis=1,inplace=False)
Y_train = train['Value_SP500_REAL_PRICE_MONTH']
print('Features size of X_train and training target Y_train shape is {} & {}'.format(X_train.shape,Y_train.shape))

#Test dataset
X_test = test.drop(['Value_SP500_REAL_PRICE_MONTH'],axis=1,inplace=False)
Y_test = test['Value_SP500_REAL_PRICE_MONTH']
print('Features size of X_test and Test target Y_test shape is {} & {}'.format(X_test.shape,Y_test.shape))
Features size of X_train and training target Y_train shape is (2838, 10) & (2838,)
Features size of X_test and Test target Y_test shape is (710, 10) & (710,)
In [92]:
#np.array(temp_train[:]).reshape(-1,1)
#print((np.array(temp_train[:]).reshape(-1,2)).shape)

Instantiate linear regression model from scikit-learn

In [93]:
# Instantiate linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(X_train, Y_train)
Out[93]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

Interpreting model coefficients

In [94]:
# print the intercept and coefficients
print(regr.intercept_)
print(regr.coef_)
137.11463021015638
[-3.89875853e+00 -5.67269650e-01 -5.46924799e+00 -6.41997252e+00
  1.29380410e-01  2.46530686e+00  0.00000000e+00 -3.98730436e-02
  1.55754077e-04  1.55754077e-04]

Making Predictions

In [95]:
#Linear regression prediction on testing set
y_pred_inter_regr = regr.predict(X_test)
In [96]:
print(y_pred_inter_regr.shape,X_test.shape,Y_test.shape)
(710,) (710, 10) (710,)

Evaluate

Model evaluation metrics for linear regression

In [97]:
feature_cols = X_train.columns
# pair the feature names with the coefficients
print('Coefficients of all feature variables in training dataset are: ')
list(zip(feature_cols, regr.coef_))
Coefficients of all feature variables in training dataset are: 
Out[97]:
[('Value_SP500_DIV_YIELD_MONTH', -3.8987585275669585),
 ('Value_SP500_PE_RATIO_MONTH', -0.5672696500428746),
 ('Value_SHILLER_PE_RATIO_MONTH', -5.469247991877379),
 ('Value_SP500_EARNINGS_YIELD_MONTH', -6.41997252365831),
 ('Value_SP500_INFLADJ_MONTH', 0.12938040986247704),
 ('Value_SP500_EARNINGS_MONTH', 2.465306856497017),
 ('Value_SP500_PSR_QUARTER', 0.0),
 ('Value_SP500_SALES_QUARTER', -0.03987304360362631),
 ('Value_SP500_REAL_SALES_GROWTH_QUARTER', 0.00015575407657666528),
 ('Value_SP500_REAL_EARNINGS_GROWTH_QUARTER', 0.00015575407657666528)]
In [98]:
# The coefficients
#print('Coefficients: \n', regr.coef_)

# The mean squared error:  is always non-negative, and values closer to zero are better.
print('Mean squared error: {}'.format(mean_squared_error(Y_test, y_pred_inter_regr)))

# The Root mean squared error: RMSE is popular than MSE, because RMSE is interpretable in the "y" units.
print('Root Mean squared error: {}'.format(np.sqrt(mean_squared_error(Y_test, y_pred_inter_regr))))

# Explained variance score: 1 is perfect prediction
print('Variance R2 score: {}'.format(r2_score(Y_test, y_pred_inter_regr)))
print('intercept: ',regr.intercept_)
print('Score : ',regr.score)
Mean squared error: 1076172.6990871658
Root Mean squared error: 1037.3874392372243
Variance R2 score: -1.9382920775190278
intercept:  137.11463021015638
Score :  <bound method RegressorMixin.score of LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)>
In [99]:
#Reshape prediction and test set to concatenate with X_test to get origninal form of test dataset.
y_pred_inter_regr = y_pred_inter_regr[:,np.newaxis]
Y_test = Y_test[:,np.newaxis]
print(X_train.shape,X_test.shape,y_pred_inter_regr.shape,Y_test.shape)
y_pred_inter_regr = concatenate((y_pred_inter_regr, X_test), axis=1)
print(X_test.shape,y_pred_inter_regr.shape,Y_test.shape)
(2838, 10) (710, 10) (710, 1) (710, 1)
(710, 10) (710, 11) (710, 1)
In [100]:
# # Plot outputs
# for col in feature_cols:
#     plt.scatter(X_test[col], Y_test)
#     #plt.plot(X_test[col], y_pred_imputed_regr)
#     #print(col)

# plt.xticks(())
# plt.yticks(())
# plt.show()
In [101]:
feature_cols = X_train.columns
# pair the feature names with the coefficients
print('Coefficients of all feature variables in training dataset are: ')
list(zip(feature_cols, regr.coef_))
Coefficients of all feature variables in training dataset are: 
Out[101]:
[('Value_SP500_DIV_YIELD_MONTH', -3.8987585275669585),
 ('Value_SP500_PE_RATIO_MONTH', -0.5672696500428746),
 ('Value_SHILLER_PE_RATIO_MONTH', -5.469247991877379),
 ('Value_SP500_EARNINGS_YIELD_MONTH', -6.41997252365831),
 ('Value_SP500_INFLADJ_MONTH', 0.12938040986247704),
 ('Value_SP500_EARNINGS_MONTH', 2.465306856497017),
 ('Value_SP500_PSR_QUARTER', 0.0),
 ('Value_SP500_SALES_QUARTER', -0.03987304360362631),
 ('Value_SP500_REAL_SALES_GROWTH_QUARTER', 0.00015575407657666528),
 ('Value_SP500_REAL_EARNINGS_GROWTH_QUARTER', 0.00015575407657666528)]

MSE, RMSE and R2 score are very bad with all features and PCA selected features.

In [105]:
from fbprophet import Prophet
# plt.style.available
plt.style.use("seaborn-whitegrid")
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import statsmodels.api as sm
from scipy import stats
In [106]:
#Reset index of df to make it a column as prerequisite for fbProphet.
df_fb = df_imputed.reset_index()
df_interpolate_fb = df_interpolate.reset_index()
df_interpolate_fb.tail() #for further use.

#Change dtype of Date column to datetime64 for fbProphet
df_interpolate_fb.loc[:,'Date'] = pd.to_datetime(df_interpolate_fb.loc[:,'Date'],format = '%Y%m%d')
In [107]:
#Plotting of S&P interpolated price vs date 
# First Subplot
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(14,5))
ax1.plot(df_interpolate_fb['Date'], df_interpolate_fb['Value_SP500_REAL_PRICE_MONTH'],color ='blue')
#ax1.plot(df_fb['Date'], df_fb['Value_SP500_REAL_PRICE_MONTH'],color='red')
ax1.set_xlabel("Date", fontsize=12)
ax1.set_ylabel("Real Price")
ax1.set_title("S&P 500 Index price History")

# #Second Subplot- original plot
# ax1.plot(df_fb['Date'], df_fb['Value_SP500_REAL_PRICE_MONTH'],color='red')
# ax1.set_xlabel("Date", fontsize=12)
# ax1.set_ylabel("Original S&P 500 Real Index price")
# ax1.set_title("S&P 500 Index price History")


#Third subplot
ax2.plot(df_interpolate_fb['Date'], df_interpolate_fb['Value_SP500_REAL_SALES_GROWTH_YEAR'],color='orange')
ax2.set_xlabel("Date", fontsize=12)
ax2.set_ylabel("SP500_REAL_SALES_GROWTH_YEAR")
ax2.set_title("SP500_REAL_SALES_GROWTH_YEAR")
plt.show()

Instantiate fbProphet

In [108]:
#Data preparation for fbprophet model. model takes date/timeseries and target label as input for fit and predict
df_interpolate_fb = df_interpolate_fb[['Date','Value_SP500_REAL_PRICE_MONTH']]

#Rename column names to 'ds' and 'y' as required by fbprophet model.
df_interpolate_fb.rename(columns={'Date':'ds','Value_SP500_REAL_PRICE_MONTH' : 'y'},inplace=True)

print(df_interpolate_fb.info(),df_interpolate_fb.shape)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3548 entries, 0 to 3547
Data columns (total 2 columns):
ds    3548 non-null datetime64[ns]
y     3548 non-null float64
dtypes: datetime64[ns](1), float64(1)
memory usage: 55.5 KB
None (3548, 2)
In [109]:
fb = Prophet()
In [110]:
fb.fit(df_interpolate_fb)
INFO:fbprophet.forecaster:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
C:\Users\Akshat\Anaconda3\lib\site-packages\pystan\misc.py:399: FutureWarning:

Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.

Out[110]:
<fbprophet.forecaster.Prophet at 0x23b4f130048>
In [111]:
#Create future dates
future_dates = fb.make_future_dataframe(periods=30)

#Predict prices for future dates
future_price = fb.predict(future_dates)
In [112]:
future_price[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()
Out[112]:
ds yhat yhat_lower yhat_upper
3573 2019-01-26 2104.546587 1953.286430 2264.258040
3574 2019-01-27 2109.010824 1955.294901 2260.299157
3575 2019-01-28 2109.505319 1957.157929 2260.275119
3576 2019-01-29 2111.732478 1960.086347 2277.521839
3577 2019-01-30 2115.552914 1960.619623 2279.389418
In [113]:
import matplotlib.dates as mdates
In [114]:
# Dates
starting_date = dt.datetime(2018, 11, 30)
starting_date1 = mdates.date2num(starting_date)
trend_date = dt.datetime(2019, 1, 1)
trend_date1 = mdates.date2num(trend_date)

pointing_arrow = dt.datetime(2019, 1, 1)
pointing_arrow1 = mdates.date2num(pointing_arrow)

# Plot.
fig = fb.plot(future_price)
ax1 = fig.add_subplot(111)
ax1.set_title("S&P500 Index Price Forecast", fontsize=16)
ax1.set_xlabel("Date", fontsize=12)
ax1.set_ylabel("Real Price", fontsize=12)

# Forecast initialization arrow
ax1.annotate('Forecast \n Initialization', xy=(pointing_arrow1, 2100), xytext=(starting_date1,2500),
            arrowprops=dict(facecolor='#ff7f50', shrink=0.1),
            )

# Trend emphasis arrow
ax1.annotate('Upward Trend', xy=(trend_date1, 2108), xytext=(trend_date1,950),
            arrowprops=dict(facecolor='#6cff6c', shrink=0.1),
            )

ax1.axhline(y=1260, color='b', linestyle='-')

plt.show()
C:\Users\Akshat\Anaconda3\lib\site-packages\matplotlib\cbook\deprecation.py:107: MatplotlibDeprecationWarning:

Adding an axes using the same arguments as a previous axes currently reuses the earlier instance.  In a future version, a new instance will always be created and returned.  Meanwhile, this warning can be suppressed, and the future behavior ensured, by passing a unique label to each axes instance.

In [115]:
fig2 = fb.plot_components(future_price)
plt.show()
In [116]:
# Monthly Data Predictions
fbm = Prophet(changepoint_prior_scale=0.01).fit(df_interpolate_fb)
future = fbm.make_future_dataframe(periods=12, freq='M')
fcst = fbm.predict(future)
fig = fbm.plot(fcst)
plt.title("Monthly Prediction \n 1 year time frame")

plt.show()
INFO:fbprophet.forecaster:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
In [117]:
fig3 = fbm.plot_components(fcst)
plt.show()